1 // Copyright (c) 1997 James Clark
2 // See the file COPYING for copying permission.
3
4 #ifdef __GNUG__
5 #pragma implementation
6 #endif
7
8 #include "splib.h"
9 #include "CodingSystemKit.h"
10 #include "TranslateCodingSystem.h"
11 #ifdef SP_MULTI_BYTE
12 #include "UTF8CodingSystem.h"
13 #include "Fixed2CodingSystem.h"
14 #include "UnicodeCodingSystem.h"
15 #include "XMLCodingSystem.h"
16 #include "EUCJPCodingSystem.h"
17 #include "SJISCodingSystem.h"
18 #include "Big5CodingSystem.h"
19 #ifdef WIN32
20 #include "Win32CodingSystem.h"
21 #endif
22 #endif /* SP_MULTI_BYTE */
23 #include "IdentityCodingSystem.h"
24 #include "Owner.h"
25
26 #include <ctype.h>
27
28 #ifdef SP_NAMESPACE
29 namespace SP_NAMESPACE {
30 #endif
31
32 #ifdef SP_MULTI_BYTE
33 const Char unicodeReplaceChar = 0xfffd;
34 #endif
35
36 class CodingSystemKitImpl : public CodingSystemKit {
37 public:
38 CodingSystemKitImpl(const TranslateCodingSystem::Desc *);
39 CodingSystemKit *copy() const;
40 Char replacementChar() const;
41 const CodingSystem *
42 identityCodingSystem() const;
43 const InputCodingSystem *
44 identityInputCodingSystem() const;
45 const InputCodingSystem *
46 makeInputCodingSystem(const StringC &,
47 const CharsetInfo &,
48 Boolean isBctf,
49 const char *&) const;
50 const CodingSystem *
51 makeCodingSystem(const char *, Boolean isBctf) const;
52 enum CodingSystemId {
53 identity,
54 fixed2,
55 utf8,
56 unicode,
57 eucjp,
58 euccn,
59 euckr,
60 sjisBctf,
61 eucBctf,
62 sjis,
63 big5,
64 big5Bctf,
65 ansi,
66 oem,
67 maybeUnicode,
68 xml,
69 iso8859_1,
70 iso8859_2,
71 iso8859_3,
72 iso8859_4,
73 iso8859_5,
74 iso8859_6,
75 iso8859_7,
76 iso8859_8,
77 iso8859_9,
78 koi8_r
79 };
80 struct Entry {
81 const char *name;
82 CodingSystemId id;
83 };
84 static Boolean match(const StringC &s,
85 const CharsetInfo &charset,
86 const char *key);
87 static Boolean match(const char *s,
88 const char *key);
89 private:
90 const CodingSystem *
91 makeCodingSystem(CodingSystemId) const;
92 const Entry *firstEntry(Boolean isBctf) const;
93 #ifdef SP_MULTI_BYTE
94 UTF8CodingSystem utf8CodingSystem_;
95 Fixed2CodingSystem fixed2CodingSystem_;
96 UnicodeCodingSystem unicodeCodingSystem_;
97 XMLCodingSystem xmlCodingSystem_;
98 EUCJPCodingSystem eucBctf_;
99 SJISCodingSystem sjisBctf_;
100 Big5CodingSystem big5Bctf_;
101 TranslateCodingSystem eucjpCodingSystem_;
102 TranslateCodingSystem euccnCodingSystem_;
103 TranslateCodingSystem euckrCodingSystem_;
104 TranslateCodingSystem sjisCodingSystem_;
105 TranslateCodingSystem big5CodingSystem_;
106 TranslateCodingSystem iso8859_1CodingSystem_;
107 TranslateCodingSystem iso8859_2CodingSystem_;
108 TranslateCodingSystem iso8859_3CodingSystem_;
109 TranslateCodingSystem iso8859_4CodingSystem_;
110 TranslateCodingSystem iso8859_5CodingSystem_;
111 TranslateCodingSystem iso8859_6CodingSystem_;
112 TranslateCodingSystem iso8859_7CodingSystem_;
113 TranslateCodingSystem iso8859_8CodingSystem_;
114 TranslateCodingSystem iso8859_9CodingSystem_;
115 TranslateCodingSystem koi8_rCodingSystem_;
116 #ifdef WIN32
117 Win32CodingSystem ansiCodingSystem_;
118 Win32CodingSystem oemCodingSystem_;
119 UnicodeCodingSystem maybeUnicodeCodingSystem_;
120 #endif
121 #endif /* SP_MULTI_BYTE */
122 IdentityCodingSystem identityCodingSystem_;
123 const TranslateCodingSystem::Desc *systemCharsetDesc_;
124 static const Entry bctfTable_[];
125 enum { nEncodingsRequireUnicode = 8 };
126 static const Entry encodingTable_[];
127 };
128
129
130 static const TranslateCodingSystem::Desc iso10646Desc[] = {
131 { CharsetRegistry::ISO10646_UCS2, 0x0 },
132 { CharsetRegistry::UNREGISTERED, 0x0 },
133 };
134
135 #ifdef SP_MULTI_BYTE
136
137 static const TranslateCodingSystem::Desc jisDesc[] = {
138 { CharsetRegistry::ISO646_C0, 0x0 },
139 { CharsetRegistry::ISO646_JIS_G0, 0x0 },
140 { CharsetRegistry::ISO6429, 0x80 },
141 { CharsetRegistry::JIS0201, 0x80 },
142 { CharsetRegistry::JIS0208, 0x8080 },
143 { CharsetRegistry::UNREGISTERED, 0x0 }
144 };
145
146 static const TranslateCodingSystem::Desc jis2Desc[] = {
147 { CharsetRegistry::ISO646_C0, 0x0 },
148 { CharsetRegistry::ISO646_JIS_G0, 0x0 },
149 { CharsetRegistry::ISO6429, 0x80 },
150 { CharsetRegistry::JIS0201, 0x80 },
151 { CharsetRegistry::JIS0208, 0x8080 },
152 { CharsetRegistry::JIS0212, 0x8000 },
153 { CharsetRegistry::UNREGISTERED, 0x0 }
154 };
155
156 static const TranslateCodingSystem::Desc gbDesc[] = {
157 { CharsetRegistry::ISO646_C0, 0x0 },
158 { CharsetRegistry::ISO646_ASCII_G0, 0x0 },
159 { CharsetRegistry::ISO6429, 0x80 },
160 { CharsetRegistry::GB2312, 0x8080 },
161 { CharsetRegistry::UNREGISTERED, 0x0 }
162 };
163
164 static const TranslateCodingSystem::Desc big5Desc[] = {
165 { CharsetRegistry::ISO646_C0, 0x0 },
166 { CharsetRegistry::ISO646_ASCII_G0, 0x0 },
167 { CharsetRegistry::BIG5, 0x0 },
168 { CharsetRegistry::UNREGISTERED, 0x0 }
169 };
170
171 static const TranslateCodingSystem::Desc kscDesc[] = {
172 { CharsetRegistry::ISO646_C0, 0x0 },
173 { CharsetRegistry::ISO646_ASCII_G0, 0x0 },
174 { CharsetRegistry::ISO6429, 0x80 },
175 { CharsetRegistry::KSC5601, 0x8080 },
176 { CharsetRegistry::UNREGISTERED, 0x0 }
177 };
178
179 static const TranslateCodingSystem::Desc iso8859_1Desc[] = {
180 { CharsetRegistry::ISO646_C0, 0x0 },
181 { CharsetRegistry::ISO646_ASCII_G0, 0x0 },
182 { CharsetRegistry::ISO6429, 0x80 },
183 { CharsetRegistry::ISO8859_1, 0x80 },
184 { CharsetRegistry::UNREGISTERED, 0x0 }
185 };
186
187 static const TranslateCodingSystem::Desc iso8859_2Desc[] = {
188 { CharsetRegistry::ISO646_C0, 0x0 },
189 { CharsetRegistry::ISO646_ASCII_G0, 0x0 },
190 { CharsetRegistry::ISO6429, 0x80 },
191 { CharsetRegistry::ISO8859_2, 0x80 },
192 { CharsetRegistry::UNREGISTERED, 0x0 }
193 };
194
195 static const TranslateCodingSystem::Desc iso8859_3Desc[] = {
196 { CharsetRegistry::ISO646_C0, 0x0 },
197 { CharsetRegistry::ISO646_ASCII_G0, 0x0 },
198 { CharsetRegistry::ISO6429, 0x80 },
199 { CharsetRegistry::ISO8859_3, 0x80 },
200 { CharsetRegistry::UNREGISTERED, 0x0 }
201 };
202
203 static const TranslateCodingSystem::Desc iso8859_4Desc[] = {
204 { CharsetRegistry::ISO646_C0, 0x0 },
205 { CharsetRegistry::ISO646_ASCII_G0, 0x0 },
206 { CharsetRegistry::ISO6429, 0x80 },
207 { CharsetRegistry::ISO8859_4, 0x80 },
208 { CharsetRegistry::UNREGISTERED, 0x0 }
209 };
210
211 static const TranslateCodingSystem::Desc iso8859_5Desc[] = {
212 { CharsetRegistry::ISO646_C0, 0x0 },
213 { CharsetRegistry::ISO646_ASCII_G0, 0x0 },
214 { CharsetRegistry::ISO6429, 0x80 },
215 { CharsetRegistry::ISO8859_5, 0x80 },
216 { CharsetRegistry::UNREGISTERED, 0x0 }
217 };
218
219 static const TranslateCodingSystem::Desc iso8859_6Desc[] = {
220 { CharsetRegistry::ISO646_C0, 0x0 },
221 { CharsetRegistry::ISO646_ASCII_G0, 0x0 },
222 { CharsetRegistry::ISO6429, 0x80 },
223 { CharsetRegistry::ISO8859_6, 0x80 },
224 { CharsetRegistry::UNREGISTERED, 0x0 }
225 };
226
227 static const TranslateCodingSystem::Desc iso8859_7Desc[] = {
228 { CharsetRegistry::ISO646_C0, 0x0 },
229 { CharsetRegistry::ISO646_ASCII_G0, 0x0 },
230 { CharsetRegistry::ISO6429, 0x80 },
231 { CharsetRegistry::ISO8859_7, 0x80 },
232 { CharsetRegistry::UNREGISTERED, 0x0 }
233 };
234
235 static const TranslateCodingSystem::Desc iso8859_8Desc[] = {
236 { CharsetRegistry::ISO646_C0, 0x0 },
237 { CharsetRegistry::ISO646_ASCII_G0, 0x0 },
238 { CharsetRegistry::ISO6429, 0x80 },
239 { CharsetRegistry::ISO8859_8, 0x80 },
240 { CharsetRegistry::UNREGISTERED, 0x0 }
241 };
242
243 static const TranslateCodingSystem::Desc iso8859_9Desc[] = {
244 { CharsetRegistry::ISO646_C0, 0x0 },
245 { CharsetRegistry::ISO646_ASCII_G0, 0x0 },
246 { CharsetRegistry::ISO6429, 0x80 },
247 { CharsetRegistry::ISO8859_9, 0x80 },
248 { CharsetRegistry::UNREGISTERED, 0x0 }
249 };
250
251 static const TranslateCodingSystem::Desc koi8_rDesc[] = {
252 { CharsetRegistry::ISO646_C0, 0x0 },
253 { CharsetRegistry::ISO646_ASCII_G0, 0x0 },
254 // FIXME: only GR part of KOI8-R is handled (i.e. 160..255)
255 // since koi8-r does not follow ISO control/graphic model
256 { CharsetRegistry::KOI8_R, 0x80 },
257 { CharsetRegistry::UNREGISTERED, 0x0 }
258 };
259
260 #endif /* SP_MULTI_BYTE */
261
262 const CodingSystemKitImpl::Entry CodingSystemKitImpl::bctfTable_[] = {
263 { "IDENTITY", identity },
264 #ifdef SP_MULTI_BYTE
265 { "FIXED-2", fixed2 },
266 { "UTF-8", utf8 },
267 { "EUC", eucBctf },
268 { "SJIS", sjisBctf },
269 { "BIG5", big5Bctf },
270 #endif /* SP_MULTI_BYTE */
271 { 0, identity },
272 };
273
274 const CodingSystemKitImpl::Entry CodingSystemKitImpl::encodingTable_[] = {
275 #ifdef SP_MULTI_BYTE
276 { "UTF-8", utf8 },
277 { "UCS-2", fixed2 },
278 { "ISO-10646-UCS-2", fixed2 },
279 { "UNICODE", unicode },
280 // We don't really support UTF-16, but treating it
281 // as Unicode should work for the most part.
282 { "UTF-16", unicode },
283 { "WINDOWS", ansi },
284 { "MS-DOS", oem },
285 { "WUNICODE", maybeUnicode },
286 { "XML", xml },
287 // nEncodingsRequireUnicode = 8
288 { "IS8859-1", iso8859_1 },
289 { "ISO-8859-1", iso8859_1 },
290 { "IS8859-2", iso8859_2 },
291 { "ISO-8859-2", iso8859_2 },
292 { "IS8859-3", iso8859_3 },
293 { "ISO-8859-3", iso8859_3 },
294 { "IS8859-4", iso8859_4 },
295 { "ISO-8859-4", iso8859_4 },
296 { "IS8859-5", iso8859_5 },
297 { "ISO-8859-5", iso8859_5 },
298 { "IS8859-6", iso8859_6 },
299 { "ISO-8859-6", iso8859_6 },
300 { "IS8859-7", iso8859_7 },
301 { "ISO-8859-7", iso8859_7 },
302 { "IS8859-8", iso8859_8 },
303 { "ISO-8859-8", iso8859_8 },
304 { "IS8859-9", iso8859_9 },
305 { "ISO-8859-9", iso8859_9 },
306 { "KOI8-R", koi8_r }, // RFC 1489
307 { "KOI8", koi8_r },
308 { "EUC-JP", eucjp },
309 { "EUC-CN", euccn },
310 { "GB2312", euccn },
311 { "CN-GB", euccn }, // RFC 1922
312 { "EUC-KR", euckr },
313 { "SJIS", sjis },
314 { "SHIFT_JIS", sjis },
315 { "BIG5", big5 },
316 { "CN-BIG5", big5 }, // RFC 1922
317 #endif /* SP_MULTI_BYTE */
318 { 0, identity },
319 };
320
CodingSystemKitImpl(const TranslateCodingSystem::Desc * systemCharsetDesc)321 CodingSystemKitImpl::CodingSystemKitImpl(const TranslateCodingSystem::Desc *systemCharsetDesc)
322 : systemCharsetDesc_(systemCharsetDesc)
323 #ifdef SP_MULTI_BYTE
324 ,
325 #ifdef WIN32
326 ansiCodingSystem_(Win32CodingSystem::codePageAnsi),
327 oemCodingSystem_(Win32CodingSystem::codePageOEM),
328 maybeUnicodeCodingSystem_(&ansiCodingSystem_),
329 #endif
330 xmlCodingSystem_(this),
331 iso8859_1CodingSystem_(&identityCodingSystem_, iso8859_1Desc, &systemCharset_, 0x100, unicodeReplaceChar),
332 iso8859_2CodingSystem_(&identityCodingSystem_, iso8859_2Desc, &systemCharset_, 0x100, unicodeReplaceChar),
333 iso8859_3CodingSystem_(&identityCodingSystem_, iso8859_3Desc, &systemCharset_, 0x100, unicodeReplaceChar),
334 iso8859_4CodingSystem_(&identityCodingSystem_, iso8859_4Desc, &systemCharset_, 0x100, unicodeReplaceChar),
335 iso8859_5CodingSystem_(&identityCodingSystem_, iso8859_5Desc, &systemCharset_, 0x100, unicodeReplaceChar),
336 iso8859_6CodingSystem_(&identityCodingSystem_, iso8859_6Desc, &systemCharset_, 0x100, unicodeReplaceChar),
337 iso8859_7CodingSystem_(&identityCodingSystem_, iso8859_7Desc, &systemCharset_, 0x100, unicodeReplaceChar),
338 iso8859_8CodingSystem_(&identityCodingSystem_, iso8859_8Desc, &systemCharset_, 0x100, unicodeReplaceChar),
339 iso8859_9CodingSystem_(&identityCodingSystem_, iso8859_9Desc, &systemCharset_, 0x100, unicodeReplaceChar),
340 koi8_rCodingSystem_(&identityCodingSystem_, koi8_rDesc, &systemCharset_, 0x100, unicodeReplaceChar),
341 eucjpCodingSystem_(&eucBctf_, jis2Desc, &systemCharset_, 0x8000, unicodeReplaceChar),
342 euccnCodingSystem_(&eucBctf_, gbDesc, &systemCharset_, 0x8000, unicodeReplaceChar),
343 euckrCodingSystem_(&eucBctf_, kscDesc, &systemCharset_, 0x8000, unicodeReplaceChar),
344 sjisCodingSystem_(&sjisBctf_, jisDesc, &systemCharset_, 0x8000, unicodeReplaceChar),
345 big5CodingSystem_(&big5Bctf_, big5Desc, &systemCharset_, 0x0080, unicodeReplaceChar)
346 #endif /* SP_MULTI_BYTE */
347 {
348 UnivCharsetDesc desc;
349 for (const TranslateCodingSystem::Desc *p = systemCharsetDesc_;
350 p->number != CharsetRegistry::UNREGISTERED;
351 p++) {
352 Owner<CharsetRegistry::Iter> iter(CharsetRegistry::makeIter(p->number));
353 if (iter) {
354 WideChar min;
355 WideChar max;
356 UnivChar univ;
357 while (iter->next(min, max, univ)) {
358 min += p->add;
359 max += p->add;
360 if (min <= charMax) {
361 if (max > charMax)
362 max = charMax;
363 desc.addRange(min, max, univ);
364 }
365 }
366 }
367 }
368 systemCharset_.set(desc);
369 }
370
copy() const371 CodingSystemKit *CodingSystemKitImpl::copy() const
372 {
373 return new CodingSystemKitImpl(systemCharsetDesc_);
374 }
375
firstEntry(Boolean isBctf) const376 const CodingSystemKitImpl::Entry *CodingSystemKitImpl::firstEntry(Boolean isBctf) const
377 {
378 if (isBctf)
379 return bctfTable_;
380 #ifdef SP_MULTI_BYTE
381 else if (systemCharsetDesc_ != iso10646Desc)
382 return encodingTable_ + nEncodingsRequireUnicode;
383 #endif
384 else
385 return encodingTable_;
386 }
387
388 const InputCodingSystem *
makeInputCodingSystem(const StringC & s,const CharsetInfo & charset,Boolean isBctf,const char * & key) const389 CodingSystemKitImpl::makeInputCodingSystem(const StringC &s,
390 const CharsetInfo &charset,
391 Boolean isBctf,
392 const char *&key) const
393 {
394 for (const Entry *p = firstEntry(isBctf); p->name; p++)
395 if (match(s, charset, p->name)) {
396 key = p->name;
397 return makeCodingSystem(p->id);
398 }
399 return 0;
400 }
401
402 Boolean
match(const StringC & s,const CharsetInfo & charset,const char * key)403 CodingSystemKitImpl::match(const StringC &s,
404 const CharsetInfo &charset,
405 const char *key)
406 {
407 for (size_t i = 0; i < s.size(); i++) {
408 if (key[i] == '\0')
409 return 0;
410 if (charset.execToDesc(toupper(key[i])) != s[i]
411 && charset.execToDesc(tolower(key[i])) != s[i])
412 return 0;
413 }
414 return key[s.size()] == '\0';
415 }
416
417 const CodingSystem *
makeCodingSystem(const char * s,Boolean isBctf) const418 CodingSystemKitImpl::makeCodingSystem(const char *s,
419 Boolean isBctf)
420 const
421 {
422 for (const Entry *p = firstEntry(isBctf); p->name; p++)
423 if (match(s, p->name))
424 return makeCodingSystem(p->id);
425 return 0;
426 }
427
428 Boolean
match(const char * s,const char * key)429 CodingSystemKitImpl::match(const char *s,
430 const char *key)
431 {
432 for (; toupper(*key) == *s || tolower(*key) == *s; s++, key++) {
433 if (*s == '\0')
434 return 1;
435 }
436 return 0;
437 }
438
439 const CodingSystem *
makeCodingSystem(CodingSystemId id) const440 CodingSystemKitImpl::makeCodingSystem(CodingSystemId id) const
441 {
442 switch (id) {
443 case identity:
444 return &identityCodingSystem_;
445 #ifdef SP_MULTI_BYTE
446 case fixed2:
447 return &fixed2CodingSystem_;
448 case utf8:
449 return &utf8CodingSystem_;
450 case unicode:
451 return &unicodeCodingSystem_;
452 case eucBctf:
453 return &eucBctf_;
454 case sjisBctf:
455 return &sjisBctf_;
456 case big5Bctf:
457 return &big5Bctf_;
458 case eucjp:
459 return &eucjpCodingSystem_;
460 case euccn:
461 return &euccnCodingSystem_;
462 case euckr:
463 return &euckrCodingSystem_;
464 case sjis:
465 return &sjisCodingSystem_;
466 case big5:
467 return &big5CodingSystem_;
468 case iso8859_1:
469 if (systemCharsetDesc_ == iso10646Desc)
470 return &identityCodingSystem_;
471 else
472 return &iso8859_1CodingSystem_;
473 case iso8859_2:
474 return &iso8859_2CodingSystem_;
475 case iso8859_3:
476 return &iso8859_3CodingSystem_;
477 case iso8859_4:
478 return &iso8859_4CodingSystem_;
479 case iso8859_5:
480 return &iso8859_5CodingSystem_;
481 case iso8859_6:
482 return &iso8859_6CodingSystem_;
483 case iso8859_7:
484 return &iso8859_7CodingSystem_;
485 case iso8859_8:
486 return &iso8859_8CodingSystem_;
487 case iso8859_9:
488 return &iso8859_9CodingSystem_;
489 case koi8_r:
490 return &koi8_rCodingSystem_;
491 case xml:
492 return &xmlCodingSystem_;
493 #ifdef WIN32
494 case ansi:
495 return &ansiCodingSystem_;
496 case oem:
497 return &oemCodingSystem_;
498 case maybeUnicode:
499 return &maybeUnicodeCodingSystem_;
500 #endif /* WIN32 */
501 #endif /* SP_MULTI_BYTE */
502 default:
503 break;
504 }
505 return 0;
506 }
507
508 const InputCodingSystem *
identityInputCodingSystem() const509 CodingSystemKitImpl::identityInputCodingSystem() const
510 {
511 return &identityCodingSystem_;
512 }
513
514 const CodingSystem *
identityCodingSystem() const515 CodingSystemKitImpl::identityCodingSystem() const
516 {
517 return &identityCodingSystem_;
518 }
519
replacementChar() const520 Char CodingSystemKitImpl::replacementChar() const
521 {
522 // FIXME should vary with systemCharset
523 #ifdef SP_MULTI_BYTE
524 return unicodeReplaceChar;
525 #else
526 return 0;
527 #endif
528 }
529
530 CodingSystemKit *
make(const char * systemCharsetName)531 CodingSystemKit::make(const char *systemCharsetName)
532 {
533 #ifdef SP_MULTI_BYTE
534 if (systemCharsetName && CodingSystemKitImpl::match(systemCharsetName, "JIS"))
535 return new CodingSystemKitImpl(jis2Desc);
536 #endif
537 return new CodingSystemKitImpl(iso10646Desc);
538 }
539
~InputCodingSystemKit()540 InputCodingSystemKit::~InputCodingSystemKit()
541 {
542 }
543
544 #ifdef SP_NAMESPACE
545 }
546 #endif
547