1 // Copyright (c) 1997 James Clark
2 // See the file COPYING for copying permission.
3 
4 #ifdef __GNUG__
5 #pragma implementation
6 #endif
7 
8 #include "splib.h"
9 #include "CodingSystemKit.h"
10 #include "TranslateCodingSystem.h"
11 #ifdef SP_MULTI_BYTE
12 #include "UTF8CodingSystem.h"
13 #include "Fixed2CodingSystem.h"
14 #include "UnicodeCodingSystem.h"
15 #include "XMLCodingSystem.h"
16 #include "EUCJPCodingSystem.h"
17 #include "SJISCodingSystem.h"
18 #include "Big5CodingSystem.h"
19 #ifdef WIN32
20 #include "Win32CodingSystem.h"
21 #endif
22 #endif /* SP_MULTI_BYTE */
23 #include "IdentityCodingSystem.h"
24 #include "Owner.h"
25 
26 #include <ctype.h>
27 
28 #ifdef SP_NAMESPACE
29 namespace SP_NAMESPACE {
30 #endif
31 
32 #ifdef SP_MULTI_BYTE
// U+FFFD.  Passed as the replacement character to every
// TranslateCodingSystem member of CodingSystemKitImpl and returned by
// CodingSystemKitImpl::replacementChar().
const Char unicodeReplaceChar = 0xfffd;
34 #endif
35 
36 class CodingSystemKitImpl : public CodingSystemKit {
37 public:
38   CodingSystemKitImpl(const TranslateCodingSystem::Desc *);
39   CodingSystemKit *copy() const;
40   Char replacementChar() const;
41   const CodingSystem *
42     identityCodingSystem() const;
43   const InputCodingSystem *
44     identityInputCodingSystem() const;
45   const InputCodingSystem *
46     makeInputCodingSystem(const StringC &,
47 			  const CharsetInfo &,
48 			  Boolean isBctf,
49 			  const char *&) const;
50   const CodingSystem *
51     makeCodingSystem(const char *, Boolean isBctf) const;
52   enum CodingSystemId {
53     identity,
54     fixed2,
55     utf8,
56     unicode,
57     eucjp,
58     euccn,
59     euckr,
60     sjisBctf,
61     eucBctf,
62     sjis,
63     big5,
64     big5Bctf,
65     ansi,
66     oem,
67     maybeUnicode,
68     xml,
69     iso8859_1,
70     iso8859_2,
71     iso8859_3,
72     iso8859_4,
73     iso8859_5,
74     iso8859_6,
75     iso8859_7,
76     iso8859_8,
77     iso8859_9,
78     koi8_r
79   };
80   struct Entry {
81     const char *name;
82     CodingSystemId id;
83   };
84   static Boolean match(const StringC &s,
85 		       const CharsetInfo &charset,
86 		       const char *key);
87   static Boolean match(const char *s,
88 		       const char *key);
89 private:
90   const CodingSystem *
91     makeCodingSystem(CodingSystemId) const;
92   const Entry *firstEntry(Boolean isBctf) const;
93 #ifdef SP_MULTI_BYTE
94   UTF8CodingSystem utf8CodingSystem_;
95   Fixed2CodingSystem fixed2CodingSystem_;
96   UnicodeCodingSystem unicodeCodingSystem_;
97   XMLCodingSystem xmlCodingSystem_;
98   EUCJPCodingSystem eucBctf_;
99   SJISCodingSystem sjisBctf_;
100   Big5CodingSystem big5Bctf_;
101   TranslateCodingSystem eucjpCodingSystem_;
102   TranslateCodingSystem euccnCodingSystem_;
103   TranslateCodingSystem euckrCodingSystem_;
104   TranslateCodingSystem sjisCodingSystem_;
105   TranslateCodingSystem big5CodingSystem_;
106   TranslateCodingSystem iso8859_1CodingSystem_;
107   TranslateCodingSystem iso8859_2CodingSystem_;
108   TranslateCodingSystem iso8859_3CodingSystem_;
109   TranslateCodingSystem iso8859_4CodingSystem_;
110   TranslateCodingSystem iso8859_5CodingSystem_;
111   TranslateCodingSystem iso8859_6CodingSystem_;
112   TranslateCodingSystem iso8859_7CodingSystem_;
113   TranslateCodingSystem iso8859_8CodingSystem_;
114   TranslateCodingSystem iso8859_9CodingSystem_;
115   TranslateCodingSystem koi8_rCodingSystem_;
116 #ifdef WIN32
117   Win32CodingSystem ansiCodingSystem_;
118   Win32CodingSystem oemCodingSystem_;
119   UnicodeCodingSystem maybeUnicodeCodingSystem_;
120 #endif
121 #endif /* SP_MULTI_BYTE */
122   IdentityCodingSystem identityCodingSystem_;
123   const TranslateCodingSystem::Desc *systemCharsetDesc_;
124   static const Entry bctfTable_[];
125   enum { nEncodingsRequireUnicode = 8 };
126   static const Entry encodingTable_[];
127 };
128 
129 
// System charset descriptor for ISO 10646 (UCS-2); the default chosen by
// CodingSystemKit::make().  Code in this file compares descriptor pointers
// against this array to detect a Unicode system charset.  Each entry is
// { registered charset number, offset added to its code points }; the
// UNREGISTERED entry terminates the list.
static const TranslateCodingSystem::Desc iso10646Desc[] = {
  { CharsetRegistry::ISO10646_UCS2, 0x0 },
  { CharsetRegistry::UNREGISTERED, 0x0 },
};
134 
135 #ifdef SP_MULTI_BYTE
136 
// Charset layout used by the "SJIS"/"SHIFT_JIS" encoding
// (sjisCodingSystem_).  Entries are { charset number, offset added to its
// code points }, terminated by UNREGISTERED.
static const TranslateCodingSystem::Desc jisDesc[] = {
  { CharsetRegistry::ISO646_C0, 0x0 },
  { CharsetRegistry::ISO646_JIS_G0, 0x0 },
  { CharsetRegistry::ISO6429, 0x80 },
  { CharsetRegistry::JIS0201, 0x80 },
  { CharsetRegistry::JIS0208, 0x8080 },
  { CharsetRegistry::UNREGISTERED, 0x0 }
};
145 
// Same as jisDesc plus JIS X 0212.  Used by the "EUC-JP" encoding
// (eucjpCodingSystem_) and as the system charset descriptor when
// CodingSystemKit::make() is asked for "JIS".  Entries are
// { charset number, offset added to its code points }, terminated by
// UNREGISTERED.
static const TranslateCodingSystem::Desc jis2Desc[] = {
  { CharsetRegistry::ISO646_C0, 0x0 },
  { CharsetRegistry::ISO646_JIS_G0, 0x0 },
  { CharsetRegistry::ISO6429, 0x80 },
  { CharsetRegistry::JIS0201, 0x80 },
  { CharsetRegistry::JIS0208, 0x8080 },
  { CharsetRegistry::JIS0212, 0x8000 },
  { CharsetRegistry::UNREGISTERED, 0x0 }
};
155 
// Charset layout used by the "EUC-CN"/"GB2312"/"CN-GB" encodings
// (euccnCodingSystem_).  Entries are { charset number, offset added to its
// code points }, terminated by UNREGISTERED.
static const TranslateCodingSystem::Desc gbDesc[] = {
  { CharsetRegistry::ISO646_C0, 0x0 },
  { CharsetRegistry::ISO646_ASCII_G0, 0x0 },
  { CharsetRegistry::ISO6429, 0x80 },
  { CharsetRegistry::GB2312, 0x8080 },
  { CharsetRegistry::UNREGISTERED, 0x0 }
};
163 
// Charset layout used by the "BIG5"/"CN-BIG5" encodings
// (big5CodingSystem_).  Entries are { charset number, offset added to its
// code points }, terminated by UNREGISTERED.
static const TranslateCodingSystem::Desc big5Desc[] = {
  { CharsetRegistry::ISO646_C0, 0x0 },
  { CharsetRegistry::ISO646_ASCII_G0, 0x0 },
  { CharsetRegistry::BIG5, 0x0 },
  { CharsetRegistry::UNREGISTERED, 0x0 }
};
170 
// Charset layout used by the "EUC-KR" encoding (euckrCodingSystem_).
// Entries are { charset number, offset added to its code points },
// terminated by UNREGISTERED.
static const TranslateCodingSystem::Desc kscDesc[] = {
  { CharsetRegistry::ISO646_C0, 0x0 },
  { CharsetRegistry::ISO646_ASCII_G0, 0x0 },
  { CharsetRegistry::ISO6429, 0x80 },
  { CharsetRegistry::KSC5601, 0x8080 },
  { CharsetRegistry::UNREGISTERED, 0x0 }
};
178 
// Charset layout for ISO 8859-1 (iso8859_1CodingSystem_): ASCII in the low
// half, the ISO 6429 and ISO 8859-1 sets offset by 0x80.  Terminated by
// UNREGISTERED.
static const TranslateCodingSystem::Desc iso8859_1Desc[] = {
  { CharsetRegistry::ISO646_C0, 0x0 },
  { CharsetRegistry::ISO646_ASCII_G0, 0x0 },
  { CharsetRegistry::ISO6429, 0x80 },
  { CharsetRegistry::ISO8859_1, 0x80 },
  { CharsetRegistry::UNREGISTERED, 0x0 }
};
186 
// Charset layout for ISO 8859-2 (iso8859_2CodingSystem_): ASCII in the low
// half, the ISO 6429 and ISO 8859-2 sets offset by 0x80.  Terminated by
// UNREGISTERED.
static const TranslateCodingSystem::Desc iso8859_2Desc[] = {
  { CharsetRegistry::ISO646_C0, 0x0 },
  { CharsetRegistry::ISO646_ASCII_G0, 0x0 },
  { CharsetRegistry::ISO6429, 0x80 },
  { CharsetRegistry::ISO8859_2, 0x80 },
  { CharsetRegistry::UNREGISTERED, 0x0 }
};
194 
// Charset layout for ISO 8859-3 (iso8859_3CodingSystem_): ASCII in the low
// half, the ISO 6429 and ISO 8859-3 sets offset by 0x80.  Terminated by
// UNREGISTERED.
static const TranslateCodingSystem::Desc iso8859_3Desc[] = {
  { CharsetRegistry::ISO646_C0, 0x0 },
  { CharsetRegistry::ISO646_ASCII_G0, 0x0 },
  { CharsetRegistry::ISO6429, 0x80 },
  { CharsetRegistry::ISO8859_3, 0x80 },
  { CharsetRegistry::UNREGISTERED, 0x0 }
};
202 
// Charset layout for ISO 8859-4 (iso8859_4CodingSystem_): ASCII in the low
// half, the ISO 6429 and ISO 8859-4 sets offset by 0x80.  Terminated by
// UNREGISTERED.
static const TranslateCodingSystem::Desc iso8859_4Desc[] = {
  { CharsetRegistry::ISO646_C0, 0x0 },
  { CharsetRegistry::ISO646_ASCII_G0, 0x0 },
  { CharsetRegistry::ISO6429, 0x80 },
  { CharsetRegistry::ISO8859_4, 0x80 },
  { CharsetRegistry::UNREGISTERED, 0x0 }
};
210 
// Charset layout for ISO 8859-5 (iso8859_5CodingSystem_): ASCII in the low
// half, the ISO 6429 and ISO 8859-5 sets offset by 0x80.  Terminated by
// UNREGISTERED.
static const TranslateCodingSystem::Desc iso8859_5Desc[] = {
  { CharsetRegistry::ISO646_C0, 0x0 },
  { CharsetRegistry::ISO646_ASCII_G0, 0x0 },
  { CharsetRegistry::ISO6429, 0x80 },
  { CharsetRegistry::ISO8859_5, 0x80 },
  { CharsetRegistry::UNREGISTERED, 0x0 }
};
218 
// Charset layout for ISO 8859-6 (iso8859_6CodingSystem_): ASCII in the low
// half, the ISO 6429 and ISO 8859-6 sets offset by 0x80.  Terminated by
// UNREGISTERED.
static const TranslateCodingSystem::Desc iso8859_6Desc[] = {
  { CharsetRegistry::ISO646_C0, 0x0 },
  { CharsetRegistry::ISO646_ASCII_G0, 0x0 },
  { CharsetRegistry::ISO6429, 0x80 },
  { CharsetRegistry::ISO8859_6, 0x80 },
  { CharsetRegistry::UNREGISTERED, 0x0 }
};
226 
// Charset layout for ISO 8859-7 (iso8859_7CodingSystem_): ASCII in the low
// half, the ISO 6429 and ISO 8859-7 sets offset by 0x80.  Terminated by
// UNREGISTERED.
static const TranslateCodingSystem::Desc iso8859_7Desc[] = {
  { CharsetRegistry::ISO646_C0, 0x0 },
  { CharsetRegistry::ISO646_ASCII_G0, 0x0 },
  { CharsetRegistry::ISO6429, 0x80 },
  { CharsetRegistry::ISO8859_7, 0x80 },
  { CharsetRegistry::UNREGISTERED, 0x0 }
};
234 
// Charset layout for ISO 8859-8 (iso8859_8CodingSystem_): ASCII in the low
// half, the ISO 6429 and ISO 8859-8 sets offset by 0x80.  Terminated by
// UNREGISTERED.
static const TranslateCodingSystem::Desc iso8859_8Desc[] = {
  { CharsetRegistry::ISO646_C0, 0x0 },
  { CharsetRegistry::ISO646_ASCII_G0, 0x0 },
  { CharsetRegistry::ISO6429, 0x80 },
  { CharsetRegistry::ISO8859_8, 0x80 },
  { CharsetRegistry::UNREGISTERED, 0x0 }
};
242 
// Charset layout for ISO 8859-9 (iso8859_9CodingSystem_): ASCII in the low
// half, the ISO 6429 and ISO 8859-9 sets offset by 0x80.  Terminated by
// UNREGISTERED.
static const TranslateCodingSystem::Desc iso8859_9Desc[] = {
  { CharsetRegistry::ISO646_C0, 0x0 },
  { CharsetRegistry::ISO646_ASCII_G0, 0x0 },
  { CharsetRegistry::ISO6429, 0x80 },
  { CharsetRegistry::ISO8859_9, 0x80 },
  { CharsetRegistry::UNREGISTERED, 0x0 }
};
250 
// Charset layout for the "KOI8-R"/"KOI8" encodings (koi8_rCodingSystem_).
// Unlike the ISO 8859 descriptors there is no ISO 6429 entry.  Terminated
// by UNREGISTERED.
static const TranslateCodingSystem::Desc koi8_rDesc[] = {
  { CharsetRegistry::ISO646_C0, 0x0 },
  { CharsetRegistry::ISO646_ASCII_G0, 0x0 },
  // FIXME: only GR part of KOI8-R is handled (i.e. 160..255)
  //        since koi8-r does not follow ISO control/graphic model
  { CharsetRegistry::KOI8_R, 0x80 },
  { CharsetRegistry::UNREGISTERED, 0x0 }
};
259 
260 #endif /* SP_MULTI_BYTE */
261 
// Names searched when a lookup is made with isBctf true (see firstEntry()).
// The entry with a null name terminates the table; lookup loops stop there.
const CodingSystemKitImpl::Entry CodingSystemKitImpl::bctfTable_[] = {
  { "IDENTITY", identity },
#ifdef SP_MULTI_BYTE
  { "FIXED-2", fixed2 },
  { "UTF-8", utf8 },
  { "EUC", eucBctf },
  { "SJIS", sjisBctf },
  { "BIG5", big5Bctf },
#endif /* SP_MULTI_BYTE */
  { 0, identity },
};
273 
// Names searched when a lookup is made with isBctf false.  Matching is
// case-insensitive and the first matching row wins.  The entry with a null
// name terminates the table.
const CodingSystemKitImpl::Entry CodingSystemKitImpl::encodingTable_[] = {
#ifdef SP_MULTI_BYTE
  { "UTF-8", utf8 },
  { "UCS-2", fixed2 },
  { "ISO-10646-UCS-2", fixed2 },
  { "UNICODE", unicode },
  // We don't really support UTF-16, but treating it
  // as Unicode should work for the most part.
  { "UTF-16", unicode },
  // The next three ids only resolve to a coding system when WIN32 is
  // defined; otherwise makeCodingSystem(CodingSystemId) returns 0 for them.
  { "WINDOWS", ansi },
  { "MS-DOS", oem },
  { "WUNICODE", maybeUnicode },
  { "XML", xml },
  // Every row above this line requires a Unicode system charset;
  // firstEntry() skips nEncodingsRequireUnicode rows otherwise, so that
  // constant must equal the number of rows above.
  { "IS8859-1", iso8859_1 },
  { "ISO-8859-1", iso8859_1 },
  { "IS8859-2", iso8859_2 },
  { "ISO-8859-2", iso8859_2 },
  { "IS8859-3", iso8859_3 },
  { "ISO-8859-3", iso8859_3 },
  { "IS8859-4", iso8859_4 },
  { "ISO-8859-4", iso8859_4 },
  { "IS8859-5", iso8859_5 },
  { "ISO-8859-5", iso8859_5 },
  { "IS8859-6", iso8859_6 },
  { "ISO-8859-6", iso8859_6 },
  { "IS8859-7", iso8859_7 },
  { "ISO-8859-7", iso8859_7 },
  { "IS8859-8", iso8859_8 },
  { "ISO-8859-8", iso8859_8 },
  { "IS8859-9", iso8859_9 },
  { "ISO-8859-9", iso8859_9 },
  { "KOI8-R", koi8_r }, // RFC 1489
  { "KOI8", koi8_r },
  { "EUC-JP", eucjp },
  { "EUC-CN", euccn },
  { "GB2312", euccn },
  { "CN-GB", euccn },  // RFC 1922
  { "EUC-KR", euckr },
  { "SJIS", sjis },
  { "SHIFT_JIS", sjis },
  { "BIG5", big5 },
  { "CN-BIG5", big5 }, // RFC 1922
#endif /* SP_MULTI_BYTE */
  { 0, identity },
};
320 
CodingSystemKitImpl(const TranslateCodingSystem::Desc * systemCharsetDesc)321 CodingSystemKitImpl::CodingSystemKitImpl(const TranslateCodingSystem::Desc *systemCharsetDesc)
322 : systemCharsetDesc_(systemCharsetDesc)
323 #ifdef SP_MULTI_BYTE
324   ,
325 #ifdef WIN32
326   ansiCodingSystem_(Win32CodingSystem::codePageAnsi),
327   oemCodingSystem_(Win32CodingSystem::codePageOEM),
328   maybeUnicodeCodingSystem_(&ansiCodingSystem_),
329 #endif
330   xmlCodingSystem_(this),
331   iso8859_1CodingSystem_(&identityCodingSystem_, iso8859_1Desc, &systemCharset_, 0x100, unicodeReplaceChar),
332   iso8859_2CodingSystem_(&identityCodingSystem_, iso8859_2Desc, &systemCharset_, 0x100, unicodeReplaceChar),
333   iso8859_3CodingSystem_(&identityCodingSystem_, iso8859_3Desc, &systemCharset_, 0x100, unicodeReplaceChar),
334   iso8859_4CodingSystem_(&identityCodingSystem_, iso8859_4Desc, &systemCharset_, 0x100, unicodeReplaceChar),
335   iso8859_5CodingSystem_(&identityCodingSystem_, iso8859_5Desc, &systemCharset_, 0x100, unicodeReplaceChar),
336   iso8859_6CodingSystem_(&identityCodingSystem_, iso8859_6Desc, &systemCharset_, 0x100, unicodeReplaceChar),
337   iso8859_7CodingSystem_(&identityCodingSystem_, iso8859_7Desc, &systemCharset_, 0x100, unicodeReplaceChar),
338   iso8859_8CodingSystem_(&identityCodingSystem_, iso8859_8Desc, &systemCharset_, 0x100, unicodeReplaceChar),
339   iso8859_9CodingSystem_(&identityCodingSystem_, iso8859_9Desc, &systemCharset_, 0x100, unicodeReplaceChar),
340   koi8_rCodingSystem_(&identityCodingSystem_, koi8_rDesc, &systemCharset_, 0x100, unicodeReplaceChar),
341   eucjpCodingSystem_(&eucBctf_, jis2Desc, &systemCharset_, 0x8000, unicodeReplaceChar),
342   euccnCodingSystem_(&eucBctf_, gbDesc, &systemCharset_, 0x8000, unicodeReplaceChar),
343   euckrCodingSystem_(&eucBctf_, kscDesc, &systemCharset_, 0x8000, unicodeReplaceChar),
344   sjisCodingSystem_(&sjisBctf_, jisDesc, &systemCharset_, 0x8000, unicodeReplaceChar),
345   big5CodingSystem_(&big5Bctf_, big5Desc, &systemCharset_, 0x0080, unicodeReplaceChar)
346 #endif /* SP_MULTI_BYTE */
347 {
348   UnivCharsetDesc desc;
349   for (const TranslateCodingSystem::Desc *p = systemCharsetDesc_;
350        p->number != CharsetRegistry::UNREGISTERED;
351        p++) {
352     Owner<CharsetRegistry::Iter> iter(CharsetRegistry::makeIter(p->number));
353     if (iter) {
354       WideChar min;
355       WideChar max;
356       UnivChar univ;
357       while (iter->next(min, max, univ)) {
358 	min += p->add;
359 	max += p->add;
360 	if (min <= charMax) {
361 	  if (max > charMax)
362 	    max = charMax;
363 	  desc.addRange(min, max, univ);
364 	}
365       }
366     }
367   }
368   systemCharset_.set(desc);
369 }
370 
copy() const371 CodingSystemKit *CodingSystemKitImpl::copy() const
372 {
373   return new CodingSystemKitImpl(systemCharsetDesc_);
374 }
375 
firstEntry(Boolean isBctf) const376 const CodingSystemKitImpl::Entry *CodingSystemKitImpl::firstEntry(Boolean isBctf) const
377 {
378   if (isBctf)
379     return bctfTable_;
380 #ifdef SP_MULTI_BYTE
381   else if (systemCharsetDesc_ != iso10646Desc)
382     return encodingTable_ + nEncodingsRequireUnicode;
383 #endif
384   else
385     return encodingTable_;
386 }
387 
388 const InputCodingSystem *
makeInputCodingSystem(const StringC & s,const CharsetInfo & charset,Boolean isBctf,const char * & key) const389 CodingSystemKitImpl::makeInputCodingSystem(const StringC &s,
390 					   const CharsetInfo &charset,
391 					   Boolean isBctf,
392 					   const char *&key) const
393 {
394   for (const Entry *p = firstEntry(isBctf); p->name; p++)
395     if (match(s, charset, p->name)) {
396       key = p->name;
397       return makeCodingSystem(p->id);
398     }
399   return 0;
400 }
401 
402 Boolean
match(const StringC & s,const CharsetInfo & charset,const char * key)403 CodingSystemKitImpl::match(const StringC &s,
404 			   const CharsetInfo &charset,
405 			   const char *key)
406 {
407   for (size_t i = 0; i < s.size(); i++) {
408     if (key[i] == '\0')
409       return 0;
410     if (charset.execToDesc(toupper(key[i])) != s[i]
411         && charset.execToDesc(tolower(key[i])) != s[i])
412       return 0;
413   }
414   return key[s.size()] == '\0';
415 }
416 
417 const CodingSystem *
makeCodingSystem(const char * s,Boolean isBctf) const418 CodingSystemKitImpl::makeCodingSystem(const char *s,
419 				      Boolean isBctf)
420   const
421 {
422  for (const Entry *p = firstEntry(isBctf); p->name; p++)
423    if (match(s, p->name))
424       return makeCodingSystem(p->id);
425   return 0;
426 }
427 
428 Boolean
match(const char * s,const char * key)429 CodingSystemKitImpl::match(const char *s,
430 			   const char *key)
431 {
432   for (; toupper(*key) == *s || tolower(*key) == *s; s++, key++) {
433     if (*s == '\0')
434       return 1;
435   }
436   return 0;
437 }
438 
439 const CodingSystem *
makeCodingSystem(CodingSystemId id) const440 CodingSystemKitImpl::makeCodingSystem(CodingSystemId id) const
441 {
442   switch (id) {
443   case identity:
444     return &identityCodingSystem_;
445 #ifdef SP_MULTI_BYTE
446   case fixed2:
447     return &fixed2CodingSystem_;
448   case utf8:
449     return &utf8CodingSystem_;
450   case unicode:
451     return &unicodeCodingSystem_;
452   case eucBctf:
453     return &eucBctf_;
454   case sjisBctf:
455     return &sjisBctf_;
456   case big5Bctf:
457     return &big5Bctf_;
458   case eucjp:
459     return &eucjpCodingSystem_;
460   case euccn:
461     return &euccnCodingSystem_;
462   case euckr:
463     return &euckrCodingSystem_;
464   case sjis:
465     return &sjisCodingSystem_;
466   case big5:
467     return &big5CodingSystem_;
468   case iso8859_1:
469     if (systemCharsetDesc_ == iso10646Desc)
470       return &identityCodingSystem_;
471     else
472       return &iso8859_1CodingSystem_;
473   case iso8859_2:
474     return &iso8859_2CodingSystem_;
475   case iso8859_3:
476     return &iso8859_3CodingSystem_;
477   case iso8859_4:
478     return &iso8859_4CodingSystem_;
479   case iso8859_5:
480     return &iso8859_5CodingSystem_;
481   case iso8859_6:
482     return &iso8859_6CodingSystem_;
483   case iso8859_7:
484     return &iso8859_7CodingSystem_;
485   case iso8859_8:
486     return &iso8859_8CodingSystem_;
487   case iso8859_9:
488     return &iso8859_9CodingSystem_;
489   case koi8_r:
490     return &koi8_rCodingSystem_;
491   case xml:
492     return &xmlCodingSystem_;
493 #ifdef WIN32
494   case ansi:
495     return &ansiCodingSystem_;
496   case oem:
497     return &oemCodingSystem_;
498   case maybeUnicode:
499     return &maybeUnicodeCodingSystem_;
500 #endif /* WIN32 */
501 #endif /* SP_MULTI_BYTE */
502   default:
503     break;
504   }
505   return 0;
506 }
507 
508 const InputCodingSystem *
identityInputCodingSystem() const509 CodingSystemKitImpl::identityInputCodingSystem() const
510 {
511   return &identityCodingSystem_;
512 }
513 
514 const CodingSystem *
identityCodingSystem() const515 CodingSystemKitImpl::identityCodingSystem() const
516 {
517   return &identityCodingSystem_;
518 }
519 
replacementChar() const520 Char CodingSystemKitImpl::replacementChar() const
521 {
522   // FIXME should vary with systemCharset
523 #ifdef SP_MULTI_BYTE
524   return unicodeReplaceChar;
525 #else
526   return 0;
527 #endif
528 }
529 
530 CodingSystemKit *
make(const char * systemCharsetName)531 CodingSystemKit::make(const char *systemCharsetName)
532 {
533 #ifdef SP_MULTI_BYTE
534   if (systemCharsetName && CodingSystemKitImpl::match(systemCharsetName, "JIS"))
535     return new CodingSystemKitImpl(jis2Desc);
536 #endif
537   return new CodingSystemKitImpl(iso10646Desc);
538 }
539 
// Out-of-line destructor definition; the body is intentionally empty.
InputCodingSystemKit::~InputCodingSystemKit()
{
}
543 
544 #ifdef SP_NAMESPACE
545 }
546 #endif
547