1 // Copyright (c) 1994, 1997 James Clark
2 // See the file COPYING for copying permission.
3
4 #ifdef __GNUG__
5 #pragma implementation
6 #endif
7 #include "splib.h"
8
9 #ifdef SP_MULTI_BYTE
10
11 #include "XMLCodingSystem.h"
12 #include "UTF8CodingSystem.h"
13 #include "CodingSystemKit.h"
14 #include "Boolean.h"
15 #include "Owner.h"
16 #include "macros.h"
17 #include <stddef.h>
18 #include <string.h>
19
20 #ifdef SP_DECLARE_MEMMOVE
21 extern "C" {
22 void *memmove(void *, const void *, size_t);
23 }
24 #endif
25
26 #ifdef SP_NAMESPACE
27 namespace SP_NAMESPACE {
28 #endif
29
30 const Char ISO646_TAB = 0x9;
31 const Char ISO646_LF = 0xA;
32 const Char ISO646_CR = 0xD;
33 const Char ISO646_SPACE = 0x20;
34 const Char ISO646_QUOT = 0x22;
35 const Char ISO646_APOS = 0x27;
36 const Char ISO646_LT = 0x3C;
37 const Char ISO646_EQUAL = 0x3D;
38 const Char ISO646_GT = 0x3E;
39 const Char ISO646_QUEST = 0x3F;
40 const Char ISO646_LETTER_a = 0x61;
41 const Char ISO646_LETTER_c = 0x63;
42 const Char ISO646_LETTER_d = 0x64;
43 const Char ISO646_LETTER_e = 0x65;
44 const Char ISO646_LETTER_g = 0x67;
45 const Char ISO646_LETTER_i = 0x69;
46 const Char ISO646_LETTER_l = 0x6C;
47 const Char ISO646_LETTER_m = 0x6D;
48 const Char ISO646_LETTER_n = 0x6E;
49 const Char ISO646_LETTER_o = 0x6F;
50 const Char ISO646_LETTER_x = 0x78;
51
52 class XMLDecoder : public Decoder {
53 public:
54 XMLDecoder(const InputCodingSystemKit *);
55 size_t decode(Char *to, const char *from, size_t fromLen,
56 const char **rest);
57 Boolean convertOffset(unsigned long &offset) const;
58 private:
59
60 class UCS2 : public Decoder {
61 public:
62 UCS2(Boolean swapBytes);
63 size_t decode(Char *to, const char *from, size_t fromLen,
64 const char **rest);
65 Boolean convertOffset(unsigned long &offset) const;
66 private:
67 Boolean swapBytes_;
68 };
69 // Don't keep parsing a PI longer than this.
70 // We want to avoid reading some enormous file into memory just because
71 // some quote was left off.
72 enum { piMaxSize = 1024*32 };
73
74 void initDecoderDefault();
75 void initDecoderPI();
76 Boolean extractEncoding(StringC &name);
77 static Boolean isWS(Char);
78
79 enum DetectPhase {
80 phaseInit,
81 phasePI,
82 phaseFinish
83 };
84 DetectPhase phase_;
85 Boolean byteOrderMark_;
86 Boolean lsbFirst_;
87 int guessBytesPerChar_;
88 Owner<Decoder> subDecoder_;
89 // Contains all the characters passed to caller that were
90 // not produced by subDecoder_.
91 StringC pi_;
92 Char piLiteral_;
93 const InputCodingSystemKit *kit_;
94 };
95
XMLCodingSystem(const InputCodingSystemKit * kit)96 XMLCodingSystem::XMLCodingSystem(const InputCodingSystemKit *kit)
97 : kit_(kit)
98 {
99 }
100
makeDecoder() const101 Decoder *XMLCodingSystem::makeDecoder() const
102 {
103 return new XMLDecoder(kit_);
104 }
105
makeEncoder() const106 Encoder *XMLCodingSystem::makeEncoder() const
107 {
108 UTF8CodingSystem utf8;
109 return utf8.makeEncoder();
110 }
111
XMLDecoder(const InputCodingSystemKit * kit)112 XMLDecoder::XMLDecoder(const InputCodingSystemKit *kit)
113 : Decoder(1),
114 kit_(kit),
115 phase_(phaseInit),
116 byteOrderMark_(0),
117 lsbFirst_(0),
118 guessBytesPerChar_(1),
119 piLiteral_(0)
120 {
121 }
122
decode(Char * to,const char * from,size_t fromLen,const char ** rest)123 size_t XMLDecoder::decode(Char *to, const char *from, size_t fromLen,
124 const char **rest)
125 {
126 if (phase_ == phaseFinish)
127 return subDecoder_->decode(to, from, fromLen, rest);
128 if (phase_ == phaseInit) {
129 if (fromLen == 0) {
130 *rest = from;
131 return 0;
132 }
133 switch ((unsigned char)*from) {
134 case 0x00:
135 case 0x3C:
136 case 0xFF:
137 case 0xFE:
138 if (fromLen < 2) {
139 *rest = from;
140 return 0;
141 }
142 switch (((unsigned char)from[0] << 8) | (unsigned char)from[1]) {
143 case 0xFEFF:
144 phase_ = phasePI;
145 byteOrderMark_ = 1;
146 guessBytesPerChar_ = 2;
147 from += 2;
148 fromLen -= 2;
149 break;
150 case 0xFFFE:
151 lsbFirst_ = 1;
152 phase_ = phasePI;
153 byteOrderMark_ = 1;
154 guessBytesPerChar_ = 2;
155 from += 2;
156 fromLen -= 2;
157 break;
158 case 0x3C3F:
159 phase_ = phasePI;
160 break;
161 case 0x3C00:
162 lsbFirst_ = 1;
163 phase_ = phasePI;
164 guessBytesPerChar_ = 2;
165 break;
166 case 0x003C:
167 phase_ = phasePI;
168 guessBytesPerChar_ = 2;
169 break;
170 default:
171 break;
172 }
173 if (phase_ == phasePI)
174 break;
175 // fall through
176 default:
177 phase_ = phaseFinish;
178 guessBytesPerChar_ = 1;
179 initDecoderDefault();
180 return subDecoder_->decode(to, from, fromLen, rest);
181 }
182 }
183 ASSERT(phase_ == phasePI);
184 Char *p = to;
185 for (; fromLen > guessBytesPerChar_;
186 fromLen -= guessBytesPerChar_, from += guessBytesPerChar_) {
187 if (!piLiteral_ && pi_.size() > 0 && pi_[pi_.size() - 1] == ISO646_GT) {
188 initDecoderPI();
189 phase_ = phaseFinish;
190 return (p - to) + subDecoder_->decode(p, from, fromLen, rest);
191 }
192 Char c = (unsigned char)from[0];
193 if (guessBytesPerChar_ > 1) {
194 if (lsbFirst_)
195 c |= (unsigned char)from[1] << 8;
196 else {
197 c <<= 8;
198 c |= (unsigned char)from[1];
199 }
200 }
201 static const Char startBytes[] = {
202 ISO646_LT, ISO646_QUEST, ISO646_LETTER_x, ISO646_LETTER_m, ISO646_LETTER_l
203 };
204 // Stop accumulating the PI if we get characters that are illegal in the PI.
205 if (c == 0
206 || c >= 0x7F
207 || (pi_.size() > 0 && c == ISO646_LT)
208 || pi_.size() > piMaxSize
209 || (pi_.size() < 5 && c != startBytes[pi_.size()])
210 || (pi_.size() == 5 && !isWS(c))) {
211 initDecoderDefault();
212 phase_ = phaseFinish;
213 break;
214 }
215 *p++ = c;
216 pi_ += c;
217 if (piLiteral_) {
218 if (c == piLiteral_)
219 piLiteral_ = 0;
220 }
221 else if (c == ISO646_QUOT || c == ISO646_APOS)
222 piLiteral_ = c;
223 }
224 size_t n = p - to;
225 if (phase_ == phaseFinish && fromLen > 0)
226 n += subDecoder_->decode(p, from, fromLen, rest);
227 else
228 *rest = from;
229 return n;
230 }
231
convertOffset(unsigned long & n) const232 Boolean XMLDecoder::convertOffset(unsigned long &n) const
233 {
234 if (n <= pi_.size())
235 n *= guessBytesPerChar_;
236 else {
237 if (!subDecoder_)
238 return 0;
239 unsigned long tem = n - pi_.size();
240 if (!subDecoder_->convertOffset(tem))
241 return 0;
242 n = tem + pi_.size() * guessBytesPerChar_;
243 }
244 if (byteOrderMark_)
245 n += 2;
246 return 1;
247 }
248
initDecoderDefault()249 void XMLDecoder::initDecoderDefault()
250 {
251 if (guessBytesPerChar_ == 1) {
252 UTF8CodingSystem utf8;
253 subDecoder_ = utf8.makeDecoder();
254 }
255 else {
256 unsigned short n = 0x1;
257 minBytesPerChar_ = 2;
258 subDecoder_ = new UCS2((*(char *)&n == 0x1) != lsbFirst_);
259 }
260 }
261
initDecoderPI()262 void XMLDecoder::initDecoderPI()
263 {
264 StringC name;
265 if (!extractEncoding(name))
266 initDecoderDefault();
267 const char *dummy;
268 static const UnivCharsetDesc::Range range = { 0, 128, 0 };
269 CharsetInfo piCharset(UnivCharsetDesc(&range, 1));
270 const InputCodingSystem *ics
271 = kit_->makeInputCodingSystem(name,
272 piCharset,
273 0,
274 dummy);
275 if (ics) {
276 subDecoder_ = ics->makeDecoder();
277 minBytesPerChar_ = subDecoder_->minBytesPerChar();
278 }
279 if (!subDecoder_)
280 initDecoderDefault();
281 }
282
isWS(Char c)283 Boolean XMLDecoder::isWS(Char c)
284 {
285 switch (c) {
286 case ISO646_CR:
287 case ISO646_LF:
288 case ISO646_SPACE:
289 case ISO646_TAB:
290 return 1;
291 }
292 return 0;
293 }
294
extractEncoding(StringC & name)295 Boolean XMLDecoder::extractEncoding(StringC &name)
296 {
297 Char lit = 0;
298 for (size_t i = 5; i < pi_.size(); i++) {
299 if (!lit) {
300 if (pi_[i] == ISO646_APOS || pi_[i] == ISO646_QUOT)
301 lit = pi_[i];
302 else if (pi_[i] == ISO646_EQUAL) {
303 size_t j = i;
304 for (; j > 0; j--) {
305 if (!isWS(pi_[j - 1]))
306 break;
307 }
308 size_t nameEnd = j;
309 for (; j > 0; j--) {
310 if (isWS(pi_[j - 1]) || pi_[j - 1] == ISO646_QUOT || pi_[j - 1] == ISO646_APOS)
311 break;
312 }
313 static const Char encodingName[] = {
314 ISO646_LETTER_e, ISO646_LETTER_n, ISO646_LETTER_c, ISO646_LETTER_o,
315 ISO646_LETTER_d, ISO646_LETTER_i, ISO646_LETTER_n, ISO646_LETTER_g,
316 0
317 };
318 const Char *s = encodingName;
319 for (; *s && j < nameEnd; j++, s++)
320 if (pi_[j] != *s)
321 break;
322 if (j == nameEnd && *s == 0) {
323 size_t j = i + 1;
324 for (; j < pi_.size(); j++) {
325 if (!isWS(pi_[j]))
326 break;
327 }
328 if (pi_[j] == ISO646_QUOT || pi_[j] == ISO646_APOS) {
329 Char lit = pi_[j];
330 size_t nameStart = j + 1;
331 for (++j; j < pi_.size(); j++) {
332 if (pi_[j] == lit) {
333 if (j > nameStart) {
334 name.assign(&pi_[nameStart], j - nameStart);
335 return 1;
336 }
337 break;
338 }
339 }
340 }
341 return 0;
342 }
343 }
344 }
345 else if (pi_[i] == lit)
346 lit = 0;
347 }
348 return 0;
349 }
350
UCS2(Boolean swapBytes)351 XMLDecoder::UCS2::UCS2(Boolean swapBytes)
352 : swapBytes_(swapBytes)
353 {
354 }
355
decode(Char * to,const char * from,size_t fromLen,const char ** rest)356 size_t XMLDecoder::UCS2::decode(Char *to, const char *from, size_t fromLen,
357 const char **rest)
358 {
359 union U {
360 unsigned short word;
361 char bytes[2];
362 };
363 fromLen &= ~1;
364 *rest = from + fromLen;
365 if (sizeof(Char) == 2) {
366 if (!swapBytes_) {
367 if (from != (char *)to)
368 memmove(to, from, fromLen);
369 return fromLen/2;
370 }
371 }
372 if (swapBytes_) {
373 for (size_t n = fromLen; n > 0; n -= 2) {
374 U u;
375 u.bytes[1] = *from++;
376 u.bytes[0] = *from++;
377 *to++ = u.word;
378 }
379 }
380 else {
381 for (size_t n = fromLen; n > 0; n -= 2) {
382 U u;
383 u.bytes[0] = *from++;
384 u.bytes[1] = *from++;
385 *to++ = u.word;
386 }
387 }
388 return fromLen/2;
389 }
390
convertOffset(unsigned long & n) const391 Boolean XMLDecoder::UCS2::convertOffset(unsigned long &n) const
392 {
393 n *= 2;
394 return 1;
395 }
396
397 #ifdef SP_NAMESPACE
398 }
399 #endif
400
401 #else /* not SP_MULTI_BYTE */
402
403 #ifndef __GNUG__
// Presumably here so that this file still declares something when the
// multi-byte code is compiled out (some compilers object to an empty
// translation unit) -- sigh.
static char non_empty_translation_unit = 0;
405 #endif
406
407 #endif /* not SP_MULTI_BYTE */
408