1 // Copyright 2009 The Archiveopteryx Developers <info@aox.org>
2
3 #include "utf.h"
4
5 #include "estring.h"
6 #include "ustring.h"
7
8
9 /*! \class Utf8Codec utf.h
10 The Utf8Codec class implements the codec described in RFC 2279
11
12 This is also the same as in the Unicode book, but this
13 implementation follows RFC 2279.
14
    Overlong forms (e.g. 0xC0 0x80 for U+0000) are allowed by the
16 decoder, but considered badly formed.
17 */
18
19 /*! Constructs a simple UTF8 decoder/encoder. */
20
Utf8Codec()21 Utf8Codec::Utf8Codec()
22 : Codec( "UTF-8" ), pgutf( false )
23 {
24 }
25
26
27 // from RFC 2279:
28
29 // UCS-4 range (hex.) UTF-8 octet sequence (binary)
30 // 0000 0000-0000 007F 0xxxxxxx
31 // 0000 0080-0000 07FF 110xxxxx 10xxxxxx
32 // 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
33 // 0001 0000-001F FFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
34 // 0020 0000-03FF FFFF 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
35 // 0400 0000-7FFF FFFF 1111110x 10xxxxxx ... 10xxxxxx
36
37
38
fromUnicode(const UString & u)39 EString Utf8Codec::fromUnicode( const UString & u )
40 {
41 EString r;
42 r.reserve( u.length() + 40 );
43 uint i = 0;
44 while ( i < u.length() ) {
45 int c = u[i];
46 if ( pgutf && !c ) {
47 // append U+ED00 since postgres cannot store 0 bytes
48 r.append( 0xEE );
49 r.append( 0xB4 );
50 r.append( 0x80 );
51 }
52 else if ( c < 0x80 ) {
53 r.append( (char)c );
54 }
55 else if ( c < 0x800 ) {
56 r.append( 0xc0 | ((char)(c >> 6)) );
57 r.append( 0x80 | ((char)(c & 0x3f)) );
58 }
59 else if ( c < 0x10000 ) {
60 r.append( 0xe0 | ((char)(c >> 12)) );
61 r.append( 0x80 | ((char)(c >> 6) & 0x3f) );
62 r.append( 0x80 | ((char)(c & 0x3f)) );
63 }
64 else if ( c < 0x200000 ) {
65 r.append( 0xf0 | ((char)(c >> 18)) );
66 r.append( 0x80 | ((char)(c >> 12) & 0x3f) );
67 r.append( 0x80 | ((char)(c >> 6) & 0x3f) );
68 r.append( 0x80 | ((char)(c & 0x3f)) );
69 }
70 else if ( c < 0x4000000 ) {
71 r.append( 0xf8 | ((char)(c >> 24)) );
72 r.append( 0x80 | ((char)(c >> 18) & 0x3f) );
73 r.append( 0x80 | ((char)(c >> 12) & 0x3f) );
74 r.append( 0x80 | ((char)(c >> 6) & 0x3f) );
75 r.append( 0x80 | ((char)(c & 0x3f)) );
76 }
77 else if ( c > 0 ) {
78 r.append( 0xfc | ((char)(c >> 30)) );
79 r.append( 0x80 | ((char)(c >> 24) & 0x3f) );
80 r.append( 0x80 | ((char)(c >> 18) & 0x3f) );
81 r.append( 0x80 | ((char)(c >> 12) & 0x3f) );
82 r.append( 0x80 | ((char)(c >> 6) & 0x3f) );
83 r.append( 0x80 | ((char)(c & 0x3f)) );
84 }
85 i++;
86 }
87 return r;
88 }
89
90
ahead(const EString & s,int i,uint l)91 static bool ahead( const EString & s, int i, uint l )
92 {
93 int j = i+1;
94 while ( l > 0 ) {
95 if ( (s[j] & 0xc0) != 0x80 )
96 return false;
97 j++;
98 l--;
99 }
100 return true;
101 }
102
103
pick(const EString & s,int i,uint l)104 static int pick( const EString & s, int i, uint l )
105 {
106 int a = 0;
107 while ( l > 0 ) {
108 i++;
109 a = (a << 6) | (s[i] & 0x3f);
110 l--;
111 }
112 return a;
113 }
114
115 /*! Decodes the UTF-8 string \a s and returns the result. */
116
toUnicode(const EString & s)117 UString Utf8Codec::toUnicode( const EString & s )
118 {
119 UString u;
120 u.reserve( s.length() );
121 uint i = 0;
122 while ( i < s.length() ) {
123 int c = 0;
124 if ( s[i] < 0x80 ) {
125 // 0000 0000-0000 007F 0xxxxxxx
126 c = s[i];
127 i += 1;
128 }
129 else if ( (s[i] & 0xe0) == 0xc0 && ahead( s, i, 1 ) ) {
130 // 0000 0080-0000 07FF 110xxxxx 10xxxxxx
131 c = ((s[i] & 0x1f) << 6) | pick( s, i, 1 );
132 if ( c < 0x80 )
133 setState( BadlyFormed );
134 i += 2;
135 }
136 else if ( (s[i] & 0xf0) == 0xe0 && ahead( s, i, 2 ) ) {
137 // 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
138 c = ((s[i] & 0x0f) << 12) | pick( s, i, 2 );
139 if ( c < 0x800 )
140 setState( BadlyFormed );
141 if ( c == 0xED00 && pgutf )
142 c = 0;
143 i += 3;
144 }
145 else if ( (s[i] & 0xf8) == 0xf0 && ahead( s, i, 3 ) ) {
146 // 0001 0000-001F FFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
147 c = ((s[i] & 0x07) << 18) | pick( s, i, 3 );
148 if ( c < 0x10000 )
149 setState( BadlyFormed );
150 i += 4;
151 }
152 else if ( (s[i] & 0xfc) == 0xf8 && ahead( s, i, 4 ) ) {
153 // 0020 0000-03FF FFFF 111110xx 10xxxxxx 10xxxxxx ... 10xxxxxx
154 c = ((s[i] & 0x03) << 24) | pick( s, i, 4 );
155 if ( c < 0x200000 )
156 setState( BadlyFormed );
157 i += 5;
158 }
159 else if ( (s[i] & 0xfe) == 0xfc && ahead( s, i, 5 ) ) {
160 // 0400 0000-7FFF FFFF 1111110x 10xxxxxx ... 10xxxxxx
161 c = ((s[i] & 0x01) << 30) | pick( s, i, 5 );
162 if ( c < 0x4000000 )
163 setState( BadlyFormed );
164 i += 6;
165 }
166 else {
167 recordError( i, s );
168 c = 0xFFFD;
169 i++;
170 }
171 append( u, c );
172 }
173 mangleTrailingSurrogate( u );
174 return u;
175 }
176
177
178 /*! \class PgUtf8Codec utf.h
179 The PgUtf8Codec is a simple modification of Utf8Codec to be able
180 to use PostgreSQL 8.1 well.
181
182 PostgreSQL 8.1 refuses to store the unicode codepoint 0. The
183 software reports that it is an invalid byte sequence and refers to
    http://www.postgresql.org/docs/techdocs.50, but the real reason is
185 that postgresql never was intended to store nulls in text, and
186 versions up to 8.0 allowed it only by accident.
187
188 Since quite a few messages contain null bytes, we remap 0 to
189 U+ED00 (a private-use codepoint, also used by Unknown8BitCodec)
190 and back.
191
192 This class is not listed as a supported codec, since it's meant
193 only for postgres use, not for any other purpose.
194 */
195
196
197 /*! Constructs an empty PgUtf8Codec. */
198
PgUtf8Codec()199 PgUtf8Codec::PgUtf8Codec()
200 : Utf8Codec()
201 {
202 pgutf = true;
203 }
204
205
206
207
208 /*! \class Utf16Codec utf.h
209 The Utf16Codec implements UTF-16 as specified in RFC 2781.
210
211 For decoding, Utf16Codec autodetects UTF-16BE or -LE based on the
212 BOM, and for encoding it uses UTF-16BE with a BOM until/unless
213 decoding autodetects UTF-16LE or UTF-16BE without a BOM. In
214 practice it always uses UTF-16BE with a BOM.
215 */
216
217
218 /*! Constructs a simple UTF-16 encoder/decoder. For decoding, the
219 backend is autoselected.
220 */
221
Utf16Codec()222 Utf16Codec::Utf16Codec()
223 : Codec( "UTF-16" ), be( true ), bom( true )
224 {
225 // nothing
226 }
227
228
fromUnicode(const UString & u)229 EString Utf16Codec::fromUnicode( const UString & u )
230 {
231 EString r;
232
233 if ( !bom ) {
234 // if we don't output a BOM, reader should assume BE, so we
235 // must be BE to conform
236 be = true;
237 }
238 else if ( be ) {
239 r.append( 0xfe );
240 r.append( 0xff );
241 }
242 else {
243 r.append( 0xfe );
244 r.append( 0xff );
245 }
246
247 if ( be )
248 r.append( (new Utf16BeCodec)->fromUnicode( u ) );
249 else
250 r.append( (new Utf16LeCodec)->fromUnicode( u ) );
251
252 return r;
253 }
254
255
toUnicode(const EString & s)256 UString Utf16Codec::toUnicode( const EString & s )
257 {
258 if ( s[0] == 0xFF && s[1] == 0xFE ) {
259 be = false;
260 bom = true;
261 }
262 else if ( s[0] == 0xFE && s[1] == 0xFF ) {
263 be = true;
264 bom = true;
265 }
266 else {
267 be = true;
268 bom = false;
269 }
270
271 Codec * c = 0;
272 if ( be )
273 c = new Utf16BeCodec;
274 else
275 c = new Utf16LeCodec;
276 UString r = c->toUnicode( s );
277
278 setState( c->state() );
279 if ( c->state() == Invalid )
280 recordError( c->error() );
281 return r;
282 }
283
284
285 /*! \class Utf16LeCodec utf.h
286 The Utf16LeCodec implements UTF-16LE as specified in RFC 2781.
287
288 Utf16LeCodec removes a BOM while decoding and does not add one
289 while encoding.
290 */
291
292
293 /*! Constructs a simple UTF-16LE encoder/decoder.
294 */
295
296
Utf16LeCodec()297 Utf16LeCodec::Utf16LeCodec()
298 : Codec( "UTF-16LE" )
299 {
300 // nothing
301 }
302
303
fromUnicode(const UString & u)304 EString Utf16LeCodec::fromUnicode( const UString & u )
305 {
306 EString r;
307 r.reserve( u.length() * 2 );
308 uint i = 0;
309 while ( i < u.length() ) {
310 r.append( u[i] % 0x100 );
311 r.append( u[i] / 0x100 );
312 i++;
313 }
314 return r;
315 }
316
317
318 /*! toUnicode() is probably a little lax. No. It IS a little lax. We
319 may tighten this later. At least, we can check that \a s has an
320 even length.
321 */
322
toUnicode(const EString & s)323 UString Utf16LeCodec::toUnicode( const EString & s )
324 {
325 UString u;
326 u.reserve( s.length() / 2 );
327 uint i = 0;
328 while ( i < s.length() ) {
329 uint c = s[i] + 0x100 * s[i+1];
330 if ( !u.isEmpty() || c != 0xFEFF )
331 append( u, c );
332 i += 2;
333 }
334 mangleTrailingSurrogate( u );
335 return u;
336 }
337
338
339 /*! \class Utf16BeCodec utf.h
340 The Utf16BeCodec implements UTF-16BE as specified in RFC 2781.
341
342 Utf16BeCodec removes a BOM while decoding and does not add one
343 while encoding.
344 */
345
346
347 /*! Constructs a simple UTF-16BE encoder/decoder.
348 */
349
350
Utf16BeCodec()351 Utf16BeCodec::Utf16BeCodec()
352 : Codec( "UTF-16BE" )
353 {
354 // nothing
355 }
356
357
fromUnicode(const UString & u)358 EString Utf16BeCodec::fromUnicode( const UString & u )
359 {
360 EString r;
361 r.reserve( u.length() * 2 );
362 uint i = 0;
363 while ( i < u.length() ) {
364 r.append( u[i] / 0x100 );
365 r.append( u[i] % 0x100 );
366 i++;
367 }
368 return r;
369 }
370
371
372 /*! toUnicode() is probably a little lax. No. It IS a little lax. We
373 may tighten this later. At least, we can check that \a s has an
374 even length.
375 */
376
toUnicode(const EString & s)377 UString Utf16BeCodec::toUnicode( const EString & s )
378 {
379 UString u;
380 u.reserve( s.length() / 2 );
381 uint i = 0;
382 while ( i < s.length() ) {
383 uint c = s[i] * 0x100 + s[i+1];
384 if ( !u.isEmpty() || c != 0xFEFF )
385 append( u, c );
386 i += 2;
387 }
388 mangleTrailingSurrogate( u );
389 return u;
390 }
391
392
393 /*! \class Utf7Codec utf.h
394
395 The Utf7Codec class provides conversion to and from the UTF-7
396 encoding specified in RFC 2152. It's almost entirely unused,
397 except that some IMAP clients use its mUTF7 variation. It is
398 implemented here so that we can more easily implement mUTF7.
399 */
400
401
402 /*! Constructs a plain UTF-7 decoder/encoder.
403
404 */
405
Utf7Codec()406 Utf7Codec::Utf7Codec()
407 : Codec( "UTF-7" ), broken( false )
408 {
409 }
410
411
412 /*! This private helper returns the "correct" base64 encoding of \a u,
413 including the special case for "+".
414 */
415
e(const UString & u)416 EString Utf7Codec::e( const UString & u )
417 {
418 if ( u.length() == 1 &&
419 u[0] == ( broken ? '&' : '+' ) )
420 return "";
421
422 EString t;
423 uint i = 0;
424 while ( i < u.length() ) {
425 uint c = u[i];
426 t.append( c / 256 );
427 t.append( c % 256 );
428 i++;
429 }
430 EString e = t.e64().mid( 0, ( i * 16 + 5 ) / 6 );
431 if ( !broken )
432 return e;
433 EString b;
434 i = 0;
435 while ( i < e.length() ) {
436 if ( e[i] == '/' )
437 b.append( ',' );
438 else
439 b.append( e[i] );
440 ++i;
441 }
442 return b;
443 }
444
445
fromUnicode(const UString & u)446 EString Utf7Codec::fromUnicode( const UString & u )
447 {
448 UString u16;
449 uint i = 0;
450 while ( i < u.length() ) {
451 if ( u[i] < 0x10000 ) {
452 u16.append( u[i] );
453 }
454 else {
455 u16.append( 0xD800 + ( ( u[i] - 0x10000 ) >> 10 ) );
456 u16.append( 0xDC00 + ( ( u[i] - 0x10000 ) & 0x3ff ) );
457 }
458 i++;
459 }
460 i = 0;
461 EString r;
462 uint b = UINT_MAX;
463 while ( i < u16.length() ) {
464 uint c = u16[i];
465 if ( c < 128 &&
466 ( ( c >= 'A' && c <= 'Z' ) ||
467 ( c >= 'a' && c <= 'z' ) ||
468 ( c >= '0' && c <= '9' ) ||
469 // Set D (directly encoded characters) consists of the following
470 // characters (derived from RFC 1521, Appendix B, which no longer
471 // appears in RFC 2045): the upper and lower case letters A through Z
472 // and a through z, the 10 digits 0-9, and the following nine special
473 // characters (note that "+" and "=" are omitted):
474 c == '\'' || c == '(' || c == ')' || c == ',' ||
475 c == '-' || c == '.' || c == '/' || c == ':' ||
476 c == '?' ||
477 // Rule 3: The space (decimal 32), tab (decimal 9), carriage return
478 // (decimal 13), and line feed (decimal 10) characters may be
479 // directly represented by their ASCII equivalents.
480 c == ' ' || c == 9 || c == 13 ||
481 // Set O (optional direct characters) consists of the following
482 // characters (note that "\" and "~" are omitted):
483 c == '!' || c == '"' || c == '#' || c == '$' ||
484 c == '%' || c == '*' || c == ';' ||
485 c == '<' || c == '=' || c == '>' || c == '@' ||
486 c == '[' || c == ']' || c == '^' || c == '_' ||
487 c == '`' || c == '{' || c == '|' || c == '}' ||
488 // MUTF-7 removes & from set O, and adds +
489 c == ( broken ? '+' : '&' ) ) ) {
490 if ( b < i ) {
491 r.append( e( u16.mid( b, i - b ) ) );
492 b = UINT_MAX;
493 if ( ( c >= 'A' && c <= 'Z' ) ||
494 ( c >= 'a' && c <= 'z' ) ||
495 ( c >= '0' && c <= '9' ) ||
496 c == '/' || c == '+' || c == '-' ||
497 broken )
498 r.append( "-" );
499 }
500 r.append( c );
501 }
502 else {
503 if ( b > i ) {
504 if ( broken )
505 r.append( '&' );
506 else
507 r.append( '+' );
508 b = i;
509 }
510 }
511 ++i;
512 }
513 if ( b < i ) {
514 r.append( e( u16.mid( b ) ) );
515 r.append( "-" );
516 }
517 return r;
518 }
519
520
toUnicode(const EString & s)521 UString Utf7Codec::toUnicode( const EString & s )
522 {
523 char shift = '+';
524 if ( broken )
525 shift = '&';
526 UString u;
527 uint i = 0;
528 while ( i < s.length() ) {
529 char c = s[i++];
530 if ( c == shift && s[i] == '-' ) {
531 append( u, shift );
532 i++;
533 }
534 else if ( c == shift ) {
535 c = s[i];
536 uint b = i;
537 EString e;
538 if ( broken ) {
539 EString ohno;
540 while ( ( c >= 'A' && c <= 'Z' ) ||
541 ( c >= 'a' && c <= 'z' ) ||
542 ( c >= '0' && c <= '9' ) ||
543 c == ',' || c == '+' || c == '=' ) {
544 if ( c == ',' )
545 ohno.append( '/' );
546 else
547 ohno.append( c );
548 c = s[++i];
549 }
550 e = ohno.de64();
551 if ( s[i] != '-' && valid() )
552 setState( Invalid );
553 }
554 else {
555 while ( ( c >= 'A' && c <= 'Z' ) ||
556 ( c >= 'a' && c <= 'z' ) ||
557 ( c >= '0' && c <= '9' ) ||
558 c == '/' || c == '+' || c == '=' )
559 c = s[++i];
560 e = s.mid( b, i-b ).de64();
561 }
562 if ( i >= s.length() && wellformed() )
563 setState( BadlyFormed );
564 b = 0;
565 while ( b + 1 < e.length() ) {
566 append( u, 256*e[b] + e[b+1] );
567 b += 2;
568 }
569 while ( b < e.length() && e[b] == '\0' )
570 b++;
571 if ( b < e.length() ) {
572 recordError( b, s );
573 append( u, 0xFFFD );
574 }
575 if ( s[i] == '-' )
576 i++;
577 }
578 else {
579 append( u, c );
580 }
581 }
582 mangleTrailingSurrogate( u );
583 return u;
584 }
585
586
587 /*! This protected helper is used to help MUtf7Codec. The \a unused
588 argument is just that, unused. It's an ugly hack, and I consider
589 it entirely apposite.
590 */
591
Utf7Codec(bool unused)592 Utf7Codec::Utf7Codec( bool unused )
593 : Codec( "MUTF-7" ), broken( true )
594 {
595 unused = unused;
596 }
597
598
599 /*! \class MUtf7Codec utf.h
600
601 The MUtf7Codec class provides the modified UTF-7 encoding
602 described in RFC 3501. It is not used as a Codec in general, only
603 to encode/decode mailbox names by IMAP (and by the database during
604 one schema upgrade).
605 */
606
MUtf7Codec()607 MUtf7Codec::MUtf7Codec()
608 : Utf7Codec( true )
609 {
610 }
611
612
613 //codec UTF-7 Utf7Codec
614 //codec UTF-8 Utf8Codec
615 //codec UTF-16 Utf16Codec
616 //codec UTF-16BE Utf16BeCodec
617 //codec UTF-16LE Utf16LeCodec
618