1 // Copyright (c) 1994, 1997 James Clark
2 // See the file COPYING for copying permission.
3 
4 #ifdef __GNUG__
5 #pragma implementation
6 #endif
7 #include "splib.h"
8 
9 #ifdef SP_MULTI_BYTE
10 
11 #include "XMLCodingSystem.h"
12 #include "UTF8CodingSystem.h"
13 #include "CodingSystemKit.h"
14 #include "Boolean.h"
15 #include "Owner.h"
16 #include "macros.h"
17 #include <stddef.h>
18 #include <string.h>
19 
20 #ifdef SP_DECLARE_MEMMOVE
21 extern "C" {
22   void *memmove(void *, const void *, size_t);
23 }
24 #endif
25 
26 #ifdef SP_NAMESPACE
27 namespace SP_NAMESPACE {
28 #endif
29 
// ISO 646 (ASCII) code points for the characters this file looks for while
// scanning the start of an entity.  They are written as numeric codes rather
// than character literals.
const Char ISO646_TAB = 0x9;
const Char ISO646_LF = 0xA;
const Char ISO646_CR = 0xD;
const Char ISO646_SPACE = 0x20;
const Char ISO646_QUOT = 0x22;  // double quote
const Char ISO646_APOS = 0x27;  // single quote
const Char ISO646_LT = 0x3C;    // less-than sign
const Char ISO646_EQUAL = 0x3D; // equals sign
const Char ISO646_GT = 0x3E;    // greater-than sign
const Char ISO646_QUEST = 0x3F; // question mark
const Char ISO646_LETTER_a = 0x61;
const Char ISO646_LETTER_c = 0x63;
const Char ISO646_LETTER_d = 0x64;
const Char ISO646_LETTER_e = 0x65;
const Char ISO646_LETTER_g = 0x67;
const Char ISO646_LETTER_i = 0x69;
const Char ISO646_LETTER_l = 0x6C;
const Char ISO646_LETTER_m = 0x6D;
const Char ISO646_LETTER_n = 0x6E;
const Char ISO646_LETTER_o = 0x6F;
const Char ISO646_LETTER_x = 0x78;
51 
52 class XMLDecoder : public Decoder {
53 public:
54   XMLDecoder(const InputCodingSystemKit *);
55   size_t decode(Char *to, const char *from, size_t fromLen,
56 		const char **rest);
57   Boolean convertOffset(unsigned long &offset) const;
58 private:
59 
60   class UCS2 : public Decoder {
61   public:
62     UCS2(Boolean swapBytes);
63     size_t decode(Char *to, const char *from, size_t fromLen,
64 		  const char **rest);
65     Boolean convertOffset(unsigned long &offset) const;
66   private:
67     Boolean swapBytes_;
68   };
69   // Don't keep parsing a PI longer than this.
70   // We want to avoid reading some enormous file into memory just because
71   // some quote was left off.
72   enum { piMaxSize = 1024*32 };
73 
74   void initDecoderDefault();
75   void initDecoderPI();
76   Boolean extractEncoding(StringC &name);
77   static Boolean isWS(Char);
78 
79   enum DetectPhase {
80     phaseInit,
81     phasePI,
82     phaseFinish
83   };
84   DetectPhase phase_;
85   Boolean byteOrderMark_;
86   Boolean lsbFirst_;
87   int guessBytesPerChar_;
88   Owner<Decoder> subDecoder_;
89   // Contains all the characters passed to caller that were
90   // not produced by subDecoder_.
91   StringC pi_;
92   Char piLiteral_;
93   const InputCodingSystemKit *kit_;
94 };
95 
XMLCodingSystem(const InputCodingSystemKit * kit)96 XMLCodingSystem::XMLCodingSystem(const InputCodingSystemKit *kit)
97 : kit_(kit)
98 {
99 }
100 
makeDecoder() const101 Decoder *XMLCodingSystem::makeDecoder() const
102 {
103   return new XMLDecoder(kit_);
104 }
105 
makeEncoder() const106 Encoder *XMLCodingSystem::makeEncoder() const
107 {
108   UTF8CodingSystem utf8;
109   return utf8.makeEncoder();
110 }
111 
XMLDecoder(const InputCodingSystemKit * kit)112 XMLDecoder::XMLDecoder(const InputCodingSystemKit *kit)
113 : Decoder(1),
114   kit_(kit),
115   phase_(phaseInit),
116   byteOrderMark_(0),
117   lsbFirst_(0),
118   guessBytesPerChar_(1),
119   piLiteral_(0)
120 {
121 }
122 
decode(Char * to,const char * from,size_t fromLen,const char ** rest)123 size_t XMLDecoder::decode(Char *to, const char *from, size_t fromLen,
124 			  const char **rest)
125 {
126   if (phase_ == phaseFinish)
127     return subDecoder_->decode(to, from, fromLen, rest);
128   if (phase_ == phaseInit) {
129     if (fromLen == 0) {
130       *rest = from;
131       return 0;
132     }
133     switch ((unsigned char)*from) {
134     case 0x00:
135     case 0x3C:
136     case 0xFF:
137     case 0xFE:
138       if (fromLen < 2) {
139 	*rest = from;
140 	return 0;
141       }
142       switch (((unsigned char)from[0] << 8) | (unsigned char)from[1]) {
143       case 0xFEFF:
144 	phase_ = phasePI;
145 	byteOrderMark_ = 1;
146 	guessBytesPerChar_ = 2;
147 	from += 2;
148 	fromLen -= 2;
149 	break;
150       case 0xFFFE:
151 	lsbFirst_ = 1;
152 	phase_ = phasePI;
153 	byteOrderMark_ = 1;
154 	guessBytesPerChar_ = 2;
155 	from += 2;
156 	fromLen -= 2;
157 	break;
158       case 0x3C3F:
159 	phase_ = phasePI;
160 	break;
161       case 0x3C00:
162 	lsbFirst_ = 1;
163 	phase_ = phasePI;
164 	guessBytesPerChar_ = 2;
165 	break;
166       case 0x003C:
167 	phase_ = phasePI;
168 	guessBytesPerChar_ = 2;
169 	break;
170       default:
171 	break;
172       }
173       if (phase_ == phasePI)
174 	break;
175       // fall through
176     default:
177       phase_ = phaseFinish;
178       guessBytesPerChar_ = 1;
179       initDecoderDefault();
180       return subDecoder_->decode(to, from, fromLen, rest);
181     }
182   }
183   ASSERT(phase_ == phasePI);
184   Char *p = to;
185   for (; fromLen > guessBytesPerChar_;
186        fromLen -= guessBytesPerChar_, from += guessBytesPerChar_) {
187     if (!piLiteral_ && pi_.size() > 0 && pi_[pi_.size() - 1] == ISO646_GT) {
188       initDecoderPI();
189       phase_ = phaseFinish;
190       return (p - to) + subDecoder_->decode(p, from, fromLen, rest);
191     }
192     Char c = (unsigned char)from[0];
193     if (guessBytesPerChar_ > 1) {
194       if (lsbFirst_)
195 	c |= (unsigned char)from[1] << 8;
196       else {
197 	c <<= 8;
198 	c |= (unsigned char)from[1];
199       }
200     }
201     static const Char startBytes[] = {
202       ISO646_LT, ISO646_QUEST, ISO646_LETTER_x, ISO646_LETTER_m, ISO646_LETTER_l
203     };
204     // Stop accumulating the PI if we get characters that are illegal in the PI.
205     if (c == 0
206         || c >= 0x7F
207 	|| (pi_.size() > 0 && c == ISO646_LT)
208 	|| pi_.size() > piMaxSize
209 	|| (pi_.size() < 5 && c != startBytes[pi_.size()])
210 	|| (pi_.size() == 5 && !isWS(c))) {
211       initDecoderDefault();
212       phase_ = phaseFinish;
213       break;
214     }
215     *p++ = c;
216     pi_ += c;
217     if (piLiteral_) {
218       if (c == piLiteral_)
219 	piLiteral_ = 0;
220     }
221     else if (c == ISO646_QUOT || c == ISO646_APOS)
222       piLiteral_ = c;
223   }
224   size_t n = p - to;
225   if (phase_ == phaseFinish && fromLen > 0)
226     n += subDecoder_->decode(p, from, fromLen, rest);
227   else
228     *rest = from;
229   return n;
230 }
231 
convertOffset(unsigned long & n) const232 Boolean XMLDecoder::convertOffset(unsigned long &n) const
233 {
234   if (n <= pi_.size())
235     n *= guessBytesPerChar_;
236   else {
237     if (!subDecoder_)
238       return 0;
239     unsigned long tem = n - pi_.size();
240     if (!subDecoder_->convertOffset(tem))
241       return 0;
242     n = tem + pi_.size() * guessBytesPerChar_;
243   }
244   if (byteOrderMark_)
245     n += 2;
246   return 1;
247 }
248 
initDecoderDefault()249 void XMLDecoder::initDecoderDefault()
250 {
251   if (guessBytesPerChar_ == 1) {
252     UTF8CodingSystem utf8;
253     subDecoder_ = utf8.makeDecoder();
254   }
255   else {
256     unsigned short n = 0x1;
257     minBytesPerChar_ = 2;
258     subDecoder_ = new UCS2((*(char *)&n == 0x1) != lsbFirst_);
259   }
260 }
261 
initDecoderPI()262 void XMLDecoder::initDecoderPI()
263 {
264   StringC name;
265   if (!extractEncoding(name))
266     initDecoderDefault();
267   const char *dummy;
268   static const UnivCharsetDesc::Range range = { 0, 128, 0 };
269   CharsetInfo piCharset(UnivCharsetDesc(&range, 1));
270   const InputCodingSystem *ics
271     = kit_->makeInputCodingSystem(name,
272 				  piCharset,
273 				  0,
274 				  dummy);
275   if (ics) {
276     subDecoder_ = ics->makeDecoder();
277     minBytesPerChar_ = subDecoder_->minBytesPerChar();
278   }
279   if (!subDecoder_)
280     initDecoderDefault();
281 }
282 
isWS(Char c)283 Boolean XMLDecoder::isWS(Char c)
284 {
285   switch (c) {
286   case ISO646_CR:
287   case ISO646_LF:
288   case ISO646_SPACE:
289   case ISO646_TAB:
290     return 1;
291   }
292   return 0;
293 }
294 
extractEncoding(StringC & name)295 Boolean XMLDecoder::extractEncoding(StringC &name)
296 {
297   Char lit = 0;
298   for (size_t i = 5; i < pi_.size(); i++) {
299     if (!lit) {
300       if (pi_[i] == ISO646_APOS || pi_[i] == ISO646_QUOT)
301 	lit = pi_[i];
302       else if (pi_[i] == ISO646_EQUAL) {
303 	size_t j = i;
304 	for (; j > 0; j--) {
305 	  if (!isWS(pi_[j - 1]))
306 	    break;
307 	}
308 	size_t nameEnd = j;
309 	for (; j > 0; j--) {
310 	  if (isWS(pi_[j - 1]) || pi_[j - 1] == ISO646_QUOT || pi_[j - 1] == ISO646_APOS)
311 	    break;
312 	}
313 	static const Char encodingName[] = {
314 	  ISO646_LETTER_e, ISO646_LETTER_n, ISO646_LETTER_c, ISO646_LETTER_o,
315 	  ISO646_LETTER_d, ISO646_LETTER_i, ISO646_LETTER_n, ISO646_LETTER_g,
316 	  0
317 	};
318 	const Char *s = encodingName;
319 	for (; *s && j < nameEnd; j++, s++)
320 	  if (pi_[j] != *s)
321 	    break;
322 	if (j == nameEnd && *s == 0) {
323 	  size_t j = i + 1;
324 	  for (; j < pi_.size(); j++) {
325 	    if (!isWS(pi_[j]))
326 	      break;
327 	  }
328 	  if (pi_[j] == ISO646_QUOT || pi_[j] == ISO646_APOS) {
329 	    Char lit = pi_[j];
330 	    size_t nameStart = j + 1;
331 	    for (++j; j < pi_.size(); j++) {
332 	      if (pi_[j] == lit) {
333 		if (j > nameStart) {
334 		  name.assign(&pi_[nameStart], j - nameStart);
335 		  return 1;
336 		}
337 		break;
338 	      }
339 	    }
340 	  }
341 	  return 0;
342 	}
343       }
344     }
345     else if (pi_[i] == lit)
346       lit = 0;
347   }
348   return 0;
349 }
350 
UCS2(Boolean swapBytes)351 XMLDecoder::UCS2::UCS2(Boolean swapBytes)
352 : swapBytes_(swapBytes)
353 {
354 }
355 
decode(Char * to,const char * from,size_t fromLen,const char ** rest)356 size_t XMLDecoder::UCS2::decode(Char *to, const char *from, size_t fromLen,
357 				const char **rest)
358 {
359   union U {
360     unsigned short word;
361     char bytes[2];
362   };
363   fromLen &= ~1;
364   *rest = from + fromLen;
365   if (sizeof(Char) == 2) {
366     if (!swapBytes_) {
367       if (from != (char *)to)
368 	memmove(to, from, fromLen);
369       return fromLen/2;
370     }
371   }
372   if (swapBytes_) {
373     for (size_t n = fromLen; n > 0; n -= 2) {
374       U u;
375       u.bytes[1] = *from++;
376       u.bytes[0] = *from++;
377       *to++ = u.word;
378     }
379   }
380   else  {
381     for (size_t n = fromLen; n > 0; n -= 2) {
382       U u;
383       u.bytes[0] = *from++;
384       u.bytes[1] = *from++;
385       *to++ = u.word;
386     }
387   }
388   return fromLen/2;
389 }
390 
convertOffset(unsigned long & n) const391 Boolean XMLDecoder::UCS2::convertOffset(unsigned long &n) const
392 {
393   n *= 2;
394   return 1;
395 }
396 
397 #ifdef SP_NAMESPACE
398 }
399 #endif
400 
401 #else /* not SP_MULTI_BYTE */
402 
403 #ifndef __GNUG__
// Placeholder object: when SP_MULTI_BYTE is not defined nothing else in
// this file is compiled, and this declaration keeps the translation unit
// from being empty (as its name says).
static char non_empty_translation_unit;
405 #endif
406 
407 #endif /* not SP_MULTI_BYTE */
408