1 // Copyright 2009 The Archiveopteryx Developers <info@aox.org>
2 
3 #include "utf.h"
4 
5 #include "estring.h"
6 #include "ustring.h"
7 
8 
9 /*! \class Utf8Codec utf.h
10     The Utf8Codec class implements the codec described in RFC 2279
11 
12     This is also the same as in the Unicode book, but this
13     implementation follows RFC 2279.
14 
    Overlong forms (e.g. 0xC0 0x80 for U+0000) are allowed by the
16     decoder, but considered badly formed.
17 */
18 
19 /*! Constructs a simple UTF8 decoder/encoder. */
20 
Utf8Codec()21 Utf8Codec::Utf8Codec()
22     : Codec( "UTF-8" ), pgutf( false )
23 {
24 }
25 
26 
27 // from RFC 2279:
28 
29 // UCS-4 range (hex.)    UTF-8 octet sequence (binary)
30 // 0000 0000-0000 007F   0xxxxxxx
31 // 0000 0080-0000 07FF   110xxxxx 10xxxxxx
32 // 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx
33 // 0001 0000-001F FFFF   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
34 // 0020 0000-03FF FFFF   111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
35 // 0400 0000-7FFF FFFF   1111110x 10xxxxxx ... 10xxxxxx
36 
37 
38 
fromUnicode(const UString & u)39 EString Utf8Codec::fromUnicode( const UString & u )
40 {
41     EString r;
42     r.reserve( u.length() + 40 );
43     uint i = 0;
44     while ( i < u.length() ) {
45         int c = u[i];
46         if ( pgutf && !c ) {
47             // append U+ED00 since postgres cannot store 0 bytes
48             r.append( 0xEE );
49             r.append( 0xB4 );
50             r.append( 0x80 );
51         }
52         else if ( c < 0x80 ) {
53             r.append( (char)c );
54         }
55         else if ( c < 0x800 ) {
56             r.append( 0xc0 | ((char)(c >> 6)) );
57             r.append( 0x80 | ((char)(c & 0x3f)) );
58         }
59         else if ( c < 0x10000 ) {
60             r.append( 0xe0 | ((char)(c >> 12)) );
61             r.append( 0x80 | ((char)(c >> 6) & 0x3f) );
62             r.append( 0x80 | ((char)(c & 0x3f)) );
63         }
64         else if ( c < 0x200000 ) {
65             r.append( 0xf0 | ((char)(c >> 18)) );
66             r.append( 0x80 | ((char)(c >> 12) & 0x3f) );
67             r.append( 0x80 | ((char)(c >> 6) & 0x3f) );
68             r.append( 0x80 | ((char)(c & 0x3f)) );
69         }
70         else if ( c < 0x4000000 ) {
71             r.append( 0xf8 | ((char)(c >> 24)) );
72             r.append( 0x80 | ((char)(c >> 18) & 0x3f) );
73             r.append( 0x80 | ((char)(c >> 12) & 0x3f) );
74             r.append( 0x80 | ((char)(c >> 6) & 0x3f) );
75             r.append( 0x80 | ((char)(c & 0x3f)) );
76         }
77         else if ( c > 0 ) {
78             r.append( 0xfc | ((char)(c >> 30)) );
79             r.append( 0x80 | ((char)(c >> 24) & 0x3f) );
80             r.append( 0x80 | ((char)(c >> 18) & 0x3f) );
81             r.append( 0x80 | ((char)(c >> 12) & 0x3f) );
82             r.append( 0x80 | ((char)(c >> 6) & 0x3f) );
83             r.append( 0x80 | ((char)(c & 0x3f)) );
84         }
85         i++;
86     }
87     return r;
88 }
89 
90 
ahead(const EString & s,int i,uint l)91 static bool ahead( const EString & s, int i, uint l )
92 {
93     int j = i+1;
94     while ( l > 0 ) {
95         if ( (s[j] & 0xc0) != 0x80 )
96             return false;
97         j++;
98         l--;
99     }
100     return true;
101 }
102 
103 
pick(const EString & s,int i,uint l)104 static int pick( const EString & s, int i, uint l )
105 {
106     int a = 0;
107     while ( l > 0 ) {
108         i++;
109         a = (a << 6) | (s[i] & 0x3f);
110         l--;
111     }
112     return a;
113 }
114 
115 /*! Decodes the UTF-8 string \a s and returns the result. */
116 
toUnicode(const EString & s)117 UString Utf8Codec::toUnicode( const EString & s )
118 {
119     UString u;
120     u.reserve( s.length() );
121     uint i = 0;
122     while ( i < s.length() ) {
123         int c = 0;
124         if ( s[i] < 0x80 ) {
125             // 0000 0000-0000 007F   0xxxxxxx
126             c = s[i];
127             i += 1;
128         }
129         else if ( (s[i] & 0xe0) == 0xc0 && ahead( s, i, 1 ) ) {
130             // 0000 0080-0000 07FF   110xxxxx 10xxxxxx
131             c = ((s[i] & 0x1f) << 6) | pick( s, i, 1 );
132             if ( c < 0x80 )
133                 setState( BadlyFormed );
134             i += 2;
135         }
136         else if ( (s[i] & 0xf0) == 0xe0 && ahead( s, i, 2 ) ) {
137             // 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx
138             c = ((s[i] & 0x0f) << 12) | pick( s, i, 2 );
139             if ( c < 0x800 )
140                 setState( BadlyFormed );
141             if ( c == 0xED00 && pgutf )
142                 c = 0;
143             i += 3;
144         }
145         else if ( (s[i] & 0xf8) == 0xf0 && ahead( s, i, 3 ) ) {
146             // 0001 0000-001F FFFF   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
147             c = ((s[i] & 0x07) << 18) | pick( s, i, 3 );
148             if ( c < 0x10000 )
149                 setState( BadlyFormed );
150             i += 4;
151         }
152         else if ( (s[i] & 0xfc) == 0xf8 && ahead( s, i, 4 ) ) {
153             // 0020 0000-03FF FFFF   111110xx 10xxxxxx 10xxxxxx ... 10xxxxxx
154             c = ((s[i] & 0x03) << 24) | pick( s, i, 4 );
155             if ( c < 0x200000 )
156                 setState( BadlyFormed );
157             i += 5;
158         }
159         else if ( (s[i] & 0xfe) == 0xfc && ahead( s, i, 5 ) ) {
160             // 0400 0000-7FFF FFFF   1111110x 10xxxxxx ... 10xxxxxx
161             c = ((s[i] & 0x01) << 30) | pick( s, i, 5 );
162             if ( c < 0x4000000 )
163                 setState( BadlyFormed );
164             i += 6;
165         }
166         else {
167             recordError( i, s );
168             c = 0xFFFD;
169             i++;
170         }
171         append( u, c );
172     }
173     mangleTrailingSurrogate( u );
174     return u;
175 }
176 
177 
178 /*! \class PgUtf8Codec utf.h
179     The PgUtf8Codec is a simple modification of Utf8Codec to be able
180     to use PostgreSQL 8.1 well.
181 
182     PostgreSQL 8.1 refuses to store the unicode codepoint 0. The
183     software reports that it is an invalid byte sequence and refers to
    http://www.postgresql.org/docs/techdocs.50, but the real reason is
185     that postgresql never was intended to store nulls in text, and
186     versions up to 8.0 allowed it only by accident.
187 
188     Since quite a few messages contain null bytes, we remap 0 to
189     U+ED00 (a private-use codepoint, also used by Unknown8BitCodec)
190     and back.
191 
192     This class is not listed as a supported codec, since it's meant
193     only for postgres use, not for any other purpose.
194 */
195 
196 
197 /*!  Constructs an empty PgUtf8Codec. */
198 
PgUtf8Codec()199 PgUtf8Codec::PgUtf8Codec()
200     : Utf8Codec()
201 {
202     pgutf = true;
203 }
204 
205 
206 
207 
208 /*! \class Utf16Codec utf.h
209     The Utf16Codec implements UTF-16 as specified in RFC 2781.
210 
211     For decoding, Utf16Codec autodetects UTF-16BE or -LE based on the
212     BOM, and for encoding it uses UTF-16BE with a BOM until/unless
213     decoding autodetects UTF-16LE or UTF-16BE without a BOM. In
214     practice it always uses UTF-16BE with a BOM.
215 */
216 
217 
218 /*! Constructs a simple UTF-16 encoder/decoder. For decoding, the
219     backend is autoselected.
220 */
221 
Utf16Codec()222 Utf16Codec::Utf16Codec()
223     : Codec( "UTF-16" ), be( true ), bom( true )
224 {
225     // nothing
226 }
227 
228 
fromUnicode(const UString & u)229 EString Utf16Codec::fromUnicode( const UString & u )
230 {
231     EString r;
232 
233     if ( !bom ) {
234         // if we don't output a BOM, reader should assume BE, so we
235         // must be BE to conform
236         be = true;
237     }
238     else if ( be ) {
239         r.append( 0xfe );
240         r.append( 0xff );
241     }
242     else {
243         r.append( 0xfe );
244         r.append( 0xff );
245     }
246 
247     if ( be )
248         r.append( (new Utf16BeCodec)->fromUnicode( u ) );
249     else
250         r.append( (new Utf16LeCodec)->fromUnicode( u ) );
251 
252     return r;
253 }
254 
255 
toUnicode(const EString & s)256 UString Utf16Codec::toUnicode( const EString & s )
257 {
258     if ( s[0] == 0xFF && s[1] == 0xFE ) {
259         be = false;
260         bom = true;
261     }
262     else if ( s[0] == 0xFE && s[1] == 0xFF ) {
263         be = true;
264         bom = true;
265     }
266     else {
267         be = true;
268         bom = false;
269     }
270 
271     Codec * c = 0;
272     if ( be )
273         c = new Utf16BeCodec;
274     else
275         c = new Utf16LeCodec;
276     UString r = c->toUnicode( s );
277 
278     setState( c->state() );
279     if ( c->state() == Invalid )
280         recordError( c->error() );
281     return r;
282 }
283 
284 
285 /*! \class Utf16LeCodec utf.h
286     The Utf16LeCodec implements UTF-16LE as specified in RFC 2781.
287 
288     Utf16LeCodec removes a BOM while decoding and does not add one
289     while encoding.
290 */
291 
292 
293 /*! Constructs a simple UTF-16LE encoder/decoder.
294 */
295 
296 
Utf16LeCodec()297 Utf16LeCodec::Utf16LeCodec()
298     : Codec( "UTF-16LE" )
299 {
300     // nothing
301 }
302 
303 
fromUnicode(const UString & u)304 EString Utf16LeCodec::fromUnicode( const UString & u )
305 {
306     EString r;
307     r.reserve( u.length() * 2 );
308     uint i = 0;
309     while ( i < u.length() ) {
310         r.append( u[i] % 0x100 );
311         r.append( u[i] / 0x100 );
312         i++;
313     }
314     return r;
315 }
316 
317 
318 /*! toUnicode() is probably a little lax. No. It IS a little lax. We
319     may tighten this later. At least, we can check that \a s has an
320     even length.
321 */
322 
toUnicode(const EString & s)323 UString Utf16LeCodec::toUnicode( const EString & s )
324 {
325     UString u;
326     u.reserve( s.length() / 2 );
327     uint i = 0;
328     while ( i < s.length() ) {
329         uint c = s[i] + 0x100 * s[i+1];
330         if ( !u.isEmpty() || c != 0xFEFF )
331             append( u, c );
332         i += 2;
333     }
334     mangleTrailingSurrogate( u );
335     return u;
336 }
337 
338 
339 /*! \class Utf16BeCodec utf.h
340     The Utf16BeCodec implements UTF-16BE as specified in RFC 2781.
341 
342     Utf16BeCodec removes a BOM while decoding and does not add one
343     while encoding.
344 */
345 
346 
347 /*! Constructs a simple UTF-16BE encoder/decoder.
348 */
349 
350 
Utf16BeCodec()351 Utf16BeCodec::Utf16BeCodec()
352     : Codec( "UTF-16BE" )
353 {
354     // nothing
355 }
356 
357 
fromUnicode(const UString & u)358 EString Utf16BeCodec::fromUnicode( const UString & u )
359 {
360     EString r;
361     r.reserve( u.length() * 2 );
362     uint i = 0;
363     while ( i < u.length() ) {
364         r.append( u[i] / 0x100 );
365         r.append( u[i] % 0x100 );
366         i++;
367     }
368     return r;
369 }
370 
371 
372 /*! toUnicode() is probably a little lax. No. It IS a little lax. We
373     may tighten this later. At least, we can check that \a s has an
374     even length.
375 */
376 
toUnicode(const EString & s)377 UString Utf16BeCodec::toUnicode( const EString & s )
378 {
379     UString u;
380     u.reserve( s.length() / 2 );
381     uint i = 0;
382     while ( i < s.length() ) {
383         uint c = s[i] * 0x100 + s[i+1];
384         if ( !u.isEmpty() || c != 0xFEFF )
385             append( u, c );
386         i += 2;
387     }
388     mangleTrailingSurrogate( u );
389     return u;
390 }
391 
392 
393 /*! \class Utf7Codec utf.h
394 
395     The Utf7Codec class provides conversion to and from the UTF-7
396     encoding specified in RFC 2152. It's almost entirely unused,
397     except that some IMAP clients use its mUTF7 variation. It is
398     implemented here so that we can more easily implement mUTF7.
399 */
400 
401 
402 /*! Constructs a plain UTF-7 decoder/encoder.
403 
404 */
405 
Utf7Codec()406 Utf7Codec::Utf7Codec()
407     : Codec( "UTF-7" ), broken( false )
408 {
409 }
410 
411 
412 /*! This private helper returns the "correct" base64 encoding of \a u,
413     including the special case for "+".
414 */
415 
e(const UString & u)416 EString Utf7Codec::e( const UString & u )
417 {
418     if ( u.length() == 1 &&
419          u[0] == ( broken ? '&' : '+' ) )
420         return "";
421 
422     EString t;
423     uint i = 0;
424     while ( i < u.length() ) {
425         uint c = u[i];
426         t.append( c / 256 );
427         t.append( c % 256 );
428         i++;
429     }
430     EString e = t.e64().mid( 0, ( i * 16 + 5 ) / 6 );
431     if ( !broken )
432         return e;
433     EString b;
434     i = 0;
435     while ( i < e.length() ) {
436         if ( e[i] == '/' )
437             b.append( ',' );
438         else
439             b.append( e[i] );
440         ++i;
441     }
442     return b;
443 }
444 
445 
fromUnicode(const UString & u)446 EString Utf7Codec::fromUnicode( const UString & u )
447 {
448     UString u16;
449     uint i = 0;
450     while ( i < u.length() ) {
451         if ( u[i] < 0x10000 ) {
452             u16.append( u[i] );
453         }
454         else {
455             u16.append( 0xD800 + ( ( u[i] - 0x10000 ) >> 10 ) );
456             u16.append( 0xDC00 + ( ( u[i] - 0x10000 ) & 0x3ff ) );
457         }
458         i++;
459     }
460     i = 0;
461     EString r;
462     uint b = UINT_MAX;
463     while ( i < u16.length() ) {
464         uint c = u16[i];
465         if ( c < 128 &&
466              ( ( c >= 'A' && c <= 'Z' ) ||
467                ( c >= 'a' && c <= 'z' ) ||
468                ( c >= '0' && c <= '9' ) ||
469 // Set D (directly encoded characters) consists of the following
470 // characters (derived from RFC 1521, Appendix B, which no longer
471 // appears in RFC 2045): the upper and lower case letters A through Z
472 // and a through z, the 10 digits 0-9, and the following nine special
473 // characters (note that "+" and "=" are omitted):
474                c == '\'' || c == '(' || c == ')' || c == ',' ||
475                c == '-' || c == '.' || c == '/' || c == ':' ||
476                c == '?' ||
477 // Rule 3: The space (decimal 32), tab (decimal 9), carriage return
478 // (decimal 13), and line feed (decimal 10) characters may be
479 // directly represented by their ASCII equivalents.
480                c == ' ' || c == 9 || c == 13 ||
481 // Set O (optional direct characters) consists of the following
482 // characters (note that "\" and "~" are omitted):
483                c == '!' || c == '"' || c == '#' || c == '$' ||
484                c == '%' ||             c == '*' || c == ';' ||
485                c == '<' || c == '=' || c == '>' || c == '@' ||
486                c == '[' || c == ']' || c == '^' || c == '_' ||
487                c == '`' || c == '{' || c == '|' || c == '}' ||
488 // MUTF-7 removes & from set O, and adds +
489                c == ( broken ? '+' : '&' ) ) ) {
490             if ( b < i ) {
491                 r.append( e( u16.mid( b, i - b ) ) );
492                 b = UINT_MAX;
493                 if ( ( c >= 'A' && c <= 'Z' ) ||
494                      ( c >= 'a' && c <= 'z' ) ||
495                      ( c >= '0' && c <= '9' ) ||
496                      c == '/' || c == '+' || c == '-' ||
497                      broken )
498                     r.append( "-" );
499             }
500             r.append( c );
501         }
502         else {
503             if ( b > i ) {
504                 if ( broken )
505                     r.append( '&' );
506                 else
507                     r.append( '+' );
508                 b = i;
509             }
510         }
511         ++i;
512     }
513     if ( b < i ) {
514         r.append( e( u16.mid( b ) ) );
515         r.append( "-" );
516     }
517     return r;
518 }
519 
520 
toUnicode(const EString & s)521 UString Utf7Codec::toUnicode( const EString & s )
522 {
523     char shift = '+';
524     if ( broken )
525         shift = '&';
526     UString u;
527     uint i = 0;
528     while ( i < s.length() ) {
529         char c = s[i++];
530         if ( c == shift && s[i] == '-' ) {
531             append( u, shift );
532             i++;
533         }
534         else if ( c == shift ) {
535             c = s[i];
536             uint b = i;
537             EString e;
538             if ( broken ) {
539                 EString ohno;
540                 while ( ( c >= 'A' && c <= 'Z' ) ||
541                         ( c >= 'a' && c <= 'z' ) ||
542                         ( c >= '0' && c <= '9' ) ||
543                         c == ',' || c == '+' || c == '=' ) {
544                     if ( c == ',' )
545                         ohno.append( '/' );
546                     else
547                         ohno.append( c );
548                     c = s[++i];
549                 }
550                 e = ohno.de64();
551                 if ( s[i] != '-' && valid() )
552                     setState( Invalid );
553             }
554             else {
555                 while ( ( c >= 'A' && c <= 'Z' ) ||
556                         ( c >= 'a' && c <= 'z' ) ||
557                         ( c >= '0' && c <= '9' ) ||
558                         c == '/' || c == '+' || c == '=' )
559                     c = s[++i];
560                 e = s.mid( b, i-b ).de64();
561             }
562             if ( i >= s.length() && wellformed() )
563                 setState( BadlyFormed );
564             b = 0;
565             while ( b + 1 < e.length() ) {
566                 append( u, 256*e[b] + e[b+1] );
567                 b += 2;
568             }
569             while ( b < e.length() && e[b] == '\0' )
570                 b++;
571             if ( b < e.length() ) {
572                 recordError( b, s );
573                 append( u, 0xFFFD );
574             }
575             if ( s[i] == '-' )
576                 i++;
577         }
578         else {
579             append( u, c );
580         }
581     }
582     mangleTrailingSurrogate( u );
583     return u;
584 }
585 
586 
587 /*! This protected helper is used to help MUtf7Codec. The \a unused
588     argument is just that, unused. It's an ugly hack, and I consider
589     it entirely apposite.
590 */
591 
Utf7Codec(bool unused)592 Utf7Codec::Utf7Codec( bool unused )
593     : Codec( "MUTF-7" ), broken( true )
594 {
595     unused = unused;
596 }
597 
598 
599 /*! \class MUtf7Codec utf.h
600 
601     The MUtf7Codec class provides the modified UTF-7 encoding
602     described in RFC 3501. It is not used as a Codec in general, only
603     to encode/decode mailbox names by IMAP (and by the database during
604     one schema upgrade).
605 */
606 
MUtf7Codec()607 MUtf7Codec::MUtf7Codec()
608     : Utf7Codec( true )
609 {
610 }
611 
612 
613 //codec UTF-7 Utf7Codec
614 //codec UTF-8 Utf8Codec
615 //codec UTF-16 Utf16Codec
616 //codec UTF-16BE Utf16BeCodec
617 //codec UTF-16LE Utf16LeCodec
618