1#include <stdlib.h>
2
3#include "cunit/cunit.h"
4#include "charset.h"
5
6extern int charset_debug;
7
8/* The Unicode Replacement character 0xfffd in UTF-8 encoding */
9#define UTF8_REPLACEMENT    "\357\277\275"
10/* The Replacement char after search normalisation */
11#define SEARCH_REPLACEMENT  "\377"
12
13static void test_lookupname(void)
14{
15    charset_t cs, cs2;
16
17    /* us-ascii must exist */
18    cs = charset_lookupname("us-ascii");
19    CU_ASSERT_PTR_NOT_NULL(cs);
20
21    /* names are case-insensitive */
22    cs2 = charset_lookupname("US-ASCII");
23    CU_ASSERT_PTR_NOT_NULL(cs2);
24    CU_ASSERT_STRING_EQUAL(charset_name(cs), charset_name(cs2));
25    charset_free(&cs2);
26
27    cs2 = charset_lookupname("Us-AsCiI");
28    CU_ASSERT_PTR_NOT_NULL(cs2);
29    CU_ASSERT_STRING_EQUAL(charset_name(cs), charset_name(cs2));
30    charset_free(&cs2);
31    charset_free(&cs);
32
33    /* some others must also exist */
34    cs = charset_lookupname("utf-8");
35    CU_ASSERT_PTR_NOT_NULL(cs);
36    charset_free(&cs);
37
38    cs = charset_lookupname("utf-7");
39    CU_ASSERT_PTR_NOT_NULL(cs);
40    charset_free(&cs);
41
42    cs = charset_lookupname("iso-8859-1");
43    CU_ASSERT_PTR_NOT_NULL(cs);
44    charset_free(&cs);
45
46    /*
47     * Assert that ICU-backed charsets return the Cyrus-canonical name. E.g:
48     * ICU names windows-1252 as ibm-5348_P100-1997 but we don't want that.
49     */
50    cs = charset_lookupname("windows-1252");
51    CU_ASSERT_PTR_NOT_NULL(cs);
52    CU_ASSERT_STRING_EQUAL(charset_name(cs), "windows-1252");
53    charset_free(&cs);
54
55    cs = charset_lookupname("1252");
56    CU_ASSERT_PTR_NOT_NULL(cs);
57    CU_ASSERT_STRING_EQUAL(charset_name(cs), "windows-1252");
58    charset_free(&cs);
59
60    /* But still use the ICU name if there's no good alias for it */
61    cs = charset_lookupname("ebcdic-ar");
62    CU_ASSERT_PTR_NOT_NULL(cs);
63    CU_ASSERT_STRING_EQUAL(charset_name(cs), "ibm-16804_X110-1999");
64    charset_free(&cs);
65
66}
67
68static void test_to_utf8(void)
69{
70    charset_t cs;
71    char *s;
72    static const char ASCII_1[] = "Hello World";
73    static const char ASCII_2[] = "Hello W\370rld";
74    static const char UTF8_2[] = "Hello W" UTF8_REPLACEMENT "rld";
75    static const char BASE64_3[] = "SGVsbG8gV29ybGQ=";
76    static const char QP_4[] =
77"If you believe that truth=3Dbeauty, then surely=20=\r\n"
78"mathematics is the most beautiful branch of philosophy.\r\n";
79    static const char ASCII_4[] =
80"If you believe that truth=beauty, then surely "
81"mathematics is the most beautiful branch of philosophy.\r\n";
82
83    cs = charset_lookupname("us-ascii");
84    CU_ASSERT_PTR_NOT_NULL(cs);
85
86    /* zero length input */
87    s = charset_to_utf8("", 0, cs, ENCODING_NONE);
88    CU_ASSERT_PTR_NOT_NULL(s);
89    CU_ASSERT_STRING_EQUAL(s, "");
90    free(s);
91
92    /* invalid encoding */
93    s = charset_to_utf8(ASCII_1, sizeof(ASCII_1), cs, 0xdeadbeef);
94    CU_ASSERT_PTR_NULL(s);
95
96    /* invalid charset */
97    s = charset_to_utf8(ASCII_1, sizeof(ASCII_1), NULL, ENCODING_NONE);
98    CU_ASSERT_PTR_NULL(s);
99
100    /* simple ASCII string */
101    s = charset_to_utf8(ASCII_1, sizeof(ASCII_1)-1, cs, ENCODING_NONE);
102    CU_ASSERT_PTR_NOT_NULL(s);
103    CU_ASSERT_STRING_EQUAL(s, ASCII_1);
104    free(s);
105
106    /* ASCII string with an invalid character */
107    s = charset_to_utf8(ASCII_2, sizeof(ASCII_2)-1, cs, ENCODING_NONE);
108    CU_ASSERT_PTR_NOT_NULL(s);
109    CU_ASSERT_STRING_EQUAL(s, UTF8_2);
110    free(s);
111
112    /* base64 encoding */
113    s = charset_to_utf8(BASE64_3, sizeof(BASE64_3)-1, cs, ENCODING_BASE64);
114    CU_ASSERT_PTR_NOT_NULL(s);
115    CU_ASSERT_STRING_EQUAL(s, ASCII_1);
116    free(s);
117
118    /* Quoted-printable encoding */
119    s = charset_to_utf8(QP_4, sizeof(QP_4)-1, cs, ENCODING_QP);
120    CU_ASSERT_PTR_NOT_NULL(s);
121    CU_ASSERT_STRING_EQUAL(s, ASCII_4);
122    free(s);
123
124    charset_free(&cs);
125}
126
127static void test_to_imaputf7(void)
128{
129    charset_t csu8 = charset_lookupname("utf-8");
130    CU_ASSERT_PTR_NOT_NULL(csu8);
131    charset_t csu7 = charset_lookupname("imap-utf-7");
132    CU_ASSERT_PTR_NOT_NULL(csu7);
133
134#define TESTCASE(in, want) \
135    { \
136        char *s; \
137        char *q; \
138        static const char _in[] = (in); \
139        static const char _want[] = (want); \
140        s = charset_to_imaputf7(_in, strlen(_in), csu8, ENCODING_NONE); \
141        CU_ASSERT_PTR_NOT_NULL(s); \
142        CU_ASSERT_STRING_EQUAL(s, _want); \
143        q = charset_to_utf8(s, strlen(s), csu7, ENCODING_NONE); \
144        CU_ASSERT_PTR_NOT_NULL(q); \
145        CU_ASSERT_STRING_EQUAL(q, _in); \
146        free(q); \
147        free(s); \
148    }
149
150    /* Plain IMAP UTF-7 */
151    TESTCASE("Hello, World", "Hello, World");
152
153    /* Escaped ampersand */
154    TESTCASE("Laurel&Hardy", "Laurel&-Hardy");
155
156    /* LATIN SMALL LETTER O WITH DIAERESIS (U+00F6) */
157    TESTCASE("Tr""\xC3\xB6""del", "Tr&APY-del");
158
159    /* LATIN SMALL LETTER E WITH ACUTE (U+00E9) */
160    TESTCASE("R""\xC3\xA9""pertoire", "R&AOk-pertoire");
161
162    /* WHITE SMILING FACE' (U+263A) */
163    TESTCASE("Hi Mom \xE2\x98\xBA!", "Hi Mom &Jjo-!");
164
165    /* WHITE SMILING FACE' (U+263A) at end */
166    TESTCASE("Hi Mom \xE2\x98\xBA", "Hi Mom &Jjo-");
167
168    /* DESERET SMALL LETTER YEE (U+10437) & HAN Character (U+24B62) */
169    TESTCASE("\xF0\x90\x90\xB7""&""\xF0\xA4\xAD\xA2", "&2AHcNw-&-&2FLfYg-");
170
171    /* CARRIAGE RETURN (CR) (U+000D) LINE FEED (LF) (U+000A) */
172    TESTCASE("\x0D\x0A", "&AA0ACg-");
173
174#undef TESTCASE
175
176    charset_free(&csu8);
177    charset_free(&csu7);
178}
179
180static void test_misc_charsets(void)
181{
182#define TESTCASE(alias, in, want) \
183    { \
184        char *s; \
185        static const char _in[] = (in); \
186        static const char _want[] = (want); \
187        charset_t cs = charset_lookupname(alias); \
188        CU_ASSERT_PTR_NOT_NULL(cs); \
189        s = charset_to_utf8(_in, strlen(_in), cs, ENCODING_NONE); \
190        CU_ASSERT_PTR_NOT_NULL(s); \
191        CU_ASSERT_STRING_EQUAL(s, _want); \
192        free(s); \
193        charset_free(&cs); \
194    }
195
196    /* MSDOS Latin1 aka CP-850 */
197    TESTCASE("cp850", "Hello, World", "Hello, World");
198    TESTCASE("cp850", "fa""\x87""ade", "fa""\xc3\xa7""ade");
199
200    /* Windows-31J aka CP-932 */
201    TESTCASE("windows-31J", "Hello, World", "Hello, World");
202    TESTCASE("cp932", "Hello, ""\x90\xa2\x8a\x45",
203                      "Hello, ""\xe4\xb8\x96\xe7\x95\x8c");
204
205    /* Windows-936 aka CP-936 */
206    TESTCASE("windows-936", "Hello, World", "Hello, World");
207    TESTCASE("cp936", "\xC4\xE3\xBA\xC3\xA3\xAC\xCA\xC0\xBD\xE7",
208                      "\xE4\xBD\xA0\xE5\xA5\xBD\xEF\xBC\x8C\xE4"
209                      "\xB8\x96\xE7\x95\x8C");
210
211    /* Windows-1257 aka CP-1257 */
212    TESTCASE("windows-1257", "Hello, World", "Hello, World");
213    TESTCASE("cp1257", "\xe0\xd8\xc2", "\xC4\x85\xC5\xB2\xC4\x80");
214
215    /* KOI8-U */
216    TESTCASE("koi8-u", "Hello, World", "Hello, World");
217    TESTCASE("koi8-u", "\xA4\xA6\xA7\xAD\xB4\xB6\xB7\xBD",
218                       "\xD1\x94\xD1\x96\xD1\x97\xD2\x91"
219                       "\xD0\x84\xD0\x86\xD0\x87\xD2\x90");
220
221#undef TESTCASE
222}
223
224static void test_qp(void)
225{
226    /* corner cases in Quoted-Printable */
227#define TESTCASE(in, cs, enc, exp) \
228    { \
229        static const char _in[] = (in); \
230        static const char _exp[] = (exp); \
231        charset_t _cs = charset_lookupname(cs); \
232        CU_ASSERT_PTR_NOT_NULL(_cs); \
233        int _enc = (enc); \
234        char *s = charset_to_utf8(_in, sizeof(_in)-1, _cs, _enc); \
235        CU_ASSERT_PTR_NOT_NULL(s); \
236        CU_ASSERT_STRING_EQUAL(s, _exp); \
237        free(s); \
238        charset_free(&_cs); \
239    }
240
241    /* encoding of SP */
242    TESTCASE("ab=20xy", "us-ascii", ENCODING_QP, "ab xy");
243
244    /* encoding of '=' */
245    TESTCASE("ab=3Dxy", "us-ascii", ENCODING_QP, "ab=xy");
246
247    /* lowercase also */
248    TESTCASE("ab=3dxy", "us-ascii", ENCODING_QP, "ab=xy");
249
250    /* underscore is not special outside of headers */
251    TESTCASE("ab_xy", "us-ascii", ENCODING_QP, "ab_xy");
252
253    /* invalid characters after = are passed through
254     * even if one of them is a valid hexchar */
255    TESTCASE("ab=ZZxy", "us-ascii", ENCODING_QP, "ab=ZZxy");
256    TESTCASE("ab=ZCxy", "us-ascii", ENCODING_QP, "ab=ZCxy");
257    TESTCASE("ab=CZxy", "us-ascii", ENCODING_QP, "ab=CZxy");
258    TESTCASE("ab=Zcxy", "us-ascii", ENCODING_QP, "ab=Zcxy");
259    TESTCASE("ab=cZxy", "us-ascii", ENCODING_QP, "ab=cZxy");
260
261    /* soft line break */
262    TESTCASE("ab=\r\nxy", "us-ascii", ENCODING_QP, "abxy");
263
264#undef TESTCASE
265}
266
/*
 * Check charset_encode_mimeheader(): the exact encoded output for each
 * input, plus a bound of 76 on the distance between consecutive line
 * breaks (and from the last break to the end of the string).
 */
static void test_encode_mimeheader(void)
{
/* Encode `raw`, compare with `wire`, then walk the output line by line. */
#define ENCODES(raw, wire) \
    { \
        static const char _raw[] = (raw); \
        static const char _wire[] = (wire); \
        char *_out = charset_encode_mimeheader(_raw, 0); \
        CU_ASSERT_PTR_NOT_NULL(_out); \
        CU_ASSERT_STRING_EQUAL(_out, _wire); \
        const char *_mark = _out; \
        const char *_cur = _out; \
        while (*_cur != '\0') { \
            if (*_cur == '\n') { \
                CU_ASSERT(_cur - _mark <= 76); \
                _mark = _cur; \
            } \
            _cur++; \
        } \
        CU_ASSERT(_cur - _mark <= 76); \
        free(_out); \
    }

    ENCODES("abc", "abc");

    ENCODES("abc\r\n", "=?UTF-8?Q?abc?=");

    /* bogus indent */
    ENCODES("abc\r\nxyz", "=?UTF-8?Q?abc?=\r\n =?UTF-8?Q?xyz?=");

    /* wrap */
    ENCODES("abc\r\n xyz", "=?UTF-8?Q?abc?=\r\n =?UTF-8?Q?xyz?=");

    /* three-byte UTF-8 word barely fits line length limit */
    ENCODES("0123456789012345678901234567890123456789012345678901234\xe2\x82\xac",
            "=?UTF-8?Q?0123456789012345678901234567890123456789012345678901234=E2=82=AC?=");

    /* three-byte UTF-8 word must not be split */
    ENCODES("01234567890123456789012345678901234567890123456789012345\xe2\x82\xac",
            "=?UTF-8?Q?01234567890123456789012345678901234567890123456789012345?="
            "\r\n ""=?UTF-8?Q?=E2=82=AC?=");

#undef ENCODES
}
309
310
311static void test_decode_mimeheader(void)
312{
313    char *s;
314    static const char ASCII_1[] = "Lorem IPSUM dolor \t \t  sit amet";
315    static const char SEARCH_1[] = "LOREM IPSUM DOLOR SIT AMET";
316    static const char ASCII_B64_2[] = "Lorem =?us-ascii?q?ipsum?= dolor "
317                                      "=?US-ASCII?Q?sit amet?=";
318    static const char ASCII_B64_3[] = "Lorem =?iso-8859-1?q?ips=fcm?= \t"
319                                      "DOLOR =?iso-8859-1?Q?s=eft am=ebt?=";
320    static const char SEARCH_3[] = "LOREM IPSUM DOLOR SIT AMET";
321    static const char SEARCH_3b[] = "LOREM IPSÜM DOLOR SÏT AMËT";
322    static const char SEARCH_3c[] = "LOREMIPSUMDOLORSITAMET";
323    static const char SEARCH_3d[] = "LOREMIPSÜMDOLORSÏTAMËT";
324    static const char SEARCH_3e[] = "LOREM IPSÜM  DOLOR SÏT AMËT";
325    static const char HTML_4[] = "=?utf-8?q?=C2=A1<em>Hola</em>,_se=C3=B1or!?=";
326    static const char HTML_4a[] = "¡<EM>HOLA</EM>, SEN""\xcc\x83""OR!";
327    static const char HTML_4b[] = "¡<EM>HOLA</EM>, SENOR!";
328    static const char HTML_4c[] = "¡<em>Hola</em>, señor!";
329    static const char HTML_4d[] = "¡&lt;em&gt;Hola&lt;/em&gt;, señor!";
330    int flags = CHARSET_SKIPDIACRIT | CHARSET_MERGESPACE; /* default */
331
332    s = charset_decode_mimeheader(NULL, flags);
333    CU_ASSERT_PTR_NULL(s);
334    free(s);
335
336    s = charset_decode_mimeheader("", flags);
337    CU_ASSERT_PTR_NOT_NULL(s);
338    CU_ASSERT_STRING_EQUAL(s, "");
339    free(s);
340
341    s = charset_decode_mimeheader(ASCII_1, flags);
342    CU_ASSERT_PTR_NOT_NULL(s);
343    CU_ASSERT_STRING_EQUAL(s, SEARCH_1);
344    free(s);
345
346    s = charset_decode_mimeheader(ASCII_B64_2, flags);
347    CU_ASSERT_PTR_NOT_NULL(s);
348    CU_ASSERT_STRING_EQUAL(s, SEARCH_1);
349    free(s);
350
351    s = charset_decode_mimeheader(ASCII_B64_3, flags);
352    CU_ASSERT_PTR_NOT_NULL(s);
353    CU_ASSERT_STRING_EQUAL(s, SEARCH_3);
354    free(s);
355
356    flags = CHARSET_MERGESPACE;
357    s = charset_decode_mimeheader(ASCII_B64_3, flags);
358    CU_ASSERT_PTR_NOT_NULL(s);
359    CU_ASSERT_STRING_EQUAL(s, SEARCH_3b);
360    free(s);
361
362    flags = CHARSET_SKIPSPACE | CHARSET_SKIPDIACRIT;
363    s = charset_decode_mimeheader(ASCII_B64_3, flags);
364    CU_ASSERT_PTR_NOT_NULL(s);
365    CU_ASSERT_STRING_EQUAL(s, SEARCH_3c);
366    free(s);
367
368    flags = CHARSET_SKIPSPACE;
369    s = charset_decode_mimeheader(ASCII_B64_3, flags);
370    CU_ASSERT_PTR_NOT_NULL(s);
371    CU_ASSERT_STRING_EQUAL(s, SEARCH_3d);
372    free(s);
373
374    flags = 0;
375    s = charset_decode_mimeheader(ASCII_B64_3, flags);
376    CU_ASSERT_PTR_NOT_NULL(s);
377    CU_ASSERT_STRING_EQUAL(s, SEARCH_3e);
378    free(s);
379
380    flags = 0;
381    s = charset_decode_mimeheader(HTML_4, flags);
382    CU_ASSERT_PTR_NOT_NULL(s);
383    CU_ASSERT_STRING_EQUAL(s, HTML_4a);
384    free(s);
385
386    flags = CHARSET_SKIPDIACRIT;
387    s = charset_decode_mimeheader(HTML_4, flags);
388    CU_ASSERT_PTR_NOT_NULL(s);
389    CU_ASSERT_STRING_EQUAL(s, HTML_4b);
390    free(s);
391
392    flags = CHARSET_SNIPPET;
393    s = charset_decode_mimeheader(HTML_4, flags);
394    CU_ASSERT_PTR_NOT_NULL(s);
395    CU_ASSERT_STRING_EQUAL(s, HTML_4c);
396    free(s);
397
398    flags = CHARSET_SNIPPET|CHARSET_ESCAPEHTML;
399    s = charset_decode_mimeheader(HTML_4, flags);
400    CU_ASSERT_PTR_NOT_NULL(s);
401    CU_ASSERT_STRING_EQUAL(s, HTML_4d);
402    free(s);
403
404    static const char ASCII_EUC_KR[] = "A =?EUC-KR?B?wMzIo8Dn?= B";
405    static const char SEARCH_EUC_KR[] = "A""\x20\xec\x9d\xb4\xed\x98\xb8\xec\x9e\xac""B";
406    flags = CHARSET_SKIPDIACRIT | CHARSET_MERGESPACE; /* default */
407    s = charset_decode_mimeheader(ASCII_EUC_KR, flags);
408    CU_ASSERT_PTR_NOT_NULL(s);
409    CU_ASSERT_STRING_EQUAL(s, SEARCH_EUC_KR);
410    free(s);
411}
412
413static void test_parse_mimeheader(void)
414{
415    char *s;
416    static const char ASCII[] = "Lorem IPSUM";
417    static const char UTF8[] = "=?utf-8?q?=C2=A1Hola,_se=C3=B1or!?= Lorem IPSÜM";
418    static const char LATIN1[] = "=?ISO-8859-1?q?Caf=E9?= Lorem IPS""\xDC""M";
419
420    static const char UTF8_1[] = "¡Hola, señor! Lorem IPS" UTF8_REPLACEMENT UTF8_REPLACEMENT "M";
421    static const char UTF8_2[] = "¡Hola, señor! Lorem IPSÜM";
422    static const char LATIN1_1[] = "Café Lorem IPS" UTF8_REPLACEMENT "M";
423
424    int flags = 0; /* default */
425
426    s = charset_parse_mimeheader(NULL, flags);
427    CU_ASSERT_PTR_NULL(s);
428    free(s);
429
430    s = charset_parse_mimeheader("", flags);
431    CU_ASSERT_PTR_NOT_NULL(s);
432    CU_ASSERT_STRING_EQUAL(s, "");
433    free(s);
434
435    s = charset_parse_mimeheader(ASCII, flags);
436    CU_ASSERT_PTR_NOT_NULL(s);
437    CU_ASSERT_STRING_EQUAL(s, ASCII);
438    free(s);
439
440    s = charset_parse_mimeheader(UTF8, flags);
441    CU_ASSERT_PTR_NOT_NULL(s);
442    CU_ASSERT_STRING_EQUAL(s, UTF8_1);
443    free(s);
444
445    s = charset_parse_mimeheader(LATIN1, flags);
446    CU_ASSERT_PTR_NOT_NULL(s);
447    CU_ASSERT_STRING_EQUAL(s, LATIN1_1);
448    free(s);
449
450    flags = CHARSET_MIME_UTF8;
451
452    s = charset_parse_mimeheader(ASCII, flags);
453    CU_ASSERT_PTR_NOT_NULL(s);
454    CU_ASSERT_STRING_EQUAL(s, ASCII);
455    free(s);
456
457    s = charset_parse_mimeheader(UTF8, flags);
458    CU_ASSERT_PTR_NOT_NULL(s);
459    CU_ASSERT_STRING_EQUAL(s, UTF8_2);
460    free(s);
461
462    s = charset_parse_mimeheader(LATIN1, flags);
463    CU_ASSERT_PTR_NOT_NULL(s);
464    CU_ASSERT_STRING_EQUAL(s, LATIN1_1);
465    free(s);
466}
467
468static void test_mimeheader_badcharset(void)
469{
470    /* when given an unknown charset, the entire word is
471     * replaced with a single Unicode replacement char */
472    char *s;
473    static const char ASCII_1[] = "A =?foo?B?wMzIo8Dn?= B";
474    static const char SEARCH_1[] = "A " UTF8_REPLACEMENT "B";
475    int flags = CHARSET_SKIPDIACRIT | CHARSET_MERGESPACE; /* default */
476
477    s = charset_decode_mimeheader(ASCII_1, flags);
478    CU_ASSERT_PTR_NOT_NULL(s);
479    CU_ASSERT_STRING_EQUAL(s, SEARCH_1);
480    free(s);
481}
482
483static void test_unfold(void)
484{
485#define TESTCASE(in, wantSkip, wantKeep) \
486    { \
487        char *s; \
488        char *k; \
489        static const char _in[] = (in); \
490        static const char _wantSkip[] = (wantSkip); \
491        static const char _wantKeep[] = (wantKeep); \
492        s = charset_unfold(_in, strlen(_in), CHARSET_UNFOLD_SKIPWS); \
493        CU_ASSERT_PTR_NOT_NULL(s); \
494        CU_ASSERT_STRING_EQUAL(s, _wantSkip); \
495        k = charset_unfold(_in, strlen(_in), 0); \
496        CU_ASSERT_PTR_NOT_NULL(k); \
497        CU_ASSERT_STRING_EQUAL(k, _wantKeep); \
498        free(k); \
499        free(s); \
500    }
501
502    /* Single line */
503    TESTCASE("abcdef", "abcdef", "abcdef");
504
505    /* Single line, ending in CRLF */
506    TESTCASE("abcdef\r\n", "abcdef", "abcdef");
507
508    /* Two lines */
509    TESTCASE("abc\r\ndef", "abc\r\ndef", "abc\r\ndef");
510
511    /* Two lines, first with continuation line */
512    TESTCASE("ab\r\n c\r\ndef", "abc\r\ndef", "ab c\r\ndef");
513
514    /* Two lines, both with continuation lines */
515    TESTCASE("a\r\n\t\r\n b\r\n c\r\nd\r\n ef", "abc\r\ndef", "a\t b c\r\nd ef");
516
517    /* One long, empty continuation line */
518    /* Typically, RFCs using unfolding forbid this case. */
519    TESTCASE("\r\n\t\r\n \r\n \r\n", "", "\t  ");
520
521#undef TESTCASE
522}
523
524static void test_mime_unfold(void)
525{
526    char *s;
527
528    /* Test unfolding and the 'keep' space option.  Note that 'keep' is
529     * a bit of a misnomer, it actually converts whitespace characters
530     * to SP before keeping the same *number* of chars, which is
531     * actually quite unhelpful.
532     */
533    s = charset_decode_mimeheader(
534"From: foo@bar\r\n"
535"To: baz@quux\r\n"
536"Subject: this\r\n"
537"\tline is continued\r\n"
538"Keywords: and\r\n"
539"\tso is\r\n"
540" this one\r\n"
541"\r\n",
542    CHARSET_SKIPDIACRIT);
543    CU_ASSERT_STRING_EQUAL(s,
544"FROM: FOO@BAR  "
545"TO: BAZ@QUUX  "
546"SUBJECT: THIS LINE IS CONTINUED  "
547"KEYWORDS: AND SO IS THIS ONE  "
548"  "
549    );
550    free(s);
551
552    /* test unfolding and the 'merge' space option which merges any
553     * amount of whitespace down to a single SP character */
554    s = charset_decode_mimeheader(
555"From: foo@bar\r\n"
556"To: baz@quux\r\n"
557"Subject: this\r\n"
558"\tline is continued\r\n"
559"Keywords: and\r\n"
560"\tso is\r\n"
561" this one\r\n"
562"\r\n",
563    CHARSET_SKIPDIACRIT|CHARSET_MERGESPACE);
564    CU_ASSERT_STRING_EQUAL(s,
565"FROM: FOO@BAR "
566"TO: BAZ@QUUX "
567"SUBJECT: THIS LINE IS CONTINUED "
568"KEYWORDS: AND SO IS THIS ONE "
569    );
570    free(s);
571
572    /* test unfolding and the 'skip' space option which elides
573     * all whitespace. */
574    s = charset_decode_mimeheader(
575"From: foo@bar\r\n"
576"To: baz@quux\r\n"
577"Subject: this\r\n"
578"\tline is continued\r\n"
579"Keywords: and\r\n"
580"\tso is\r\n"
581" this one\r\n"
582"\r\n",
583    CHARSET_SKIPDIACRIT|CHARSET_SKIPSPACE);
584    CU_ASSERT_STRING_EQUAL(s,
585"FROM:FOO@BAR"
586"TO:BAZ@QUUX"
587"SUBJECT:THISLINEISCONTINUED"
588"KEYWORDS:ANDSOISTHISONE"
589    );
590    free(s);
591}
592
593static void test_search_mimeheader(void)
594{
595    char *s;
596    comp_pat *pat;
597    static const char SUBJECT_CP1252[] = "=?Cp1252?Q?Herzlichen_Gl=FCckwunsch,_der_Artikel_Canon_Ob?= "
598                                         "=?Cp1252?Q?jektiv_EF-S_18-55_mm_1:3,5-5,6_geh=F6rt_Ihnen!?=";
599    static const char SEARCH_CP1252[] = "Herzlichen";
600    int flags = CHARSET_SKIPDIACRIT | CHARSET_MERGESPACE; /* default */
601    charset_t cs = charset_lookupname("us-ascii");
602
603    s = charset_convert(SEARCH_CP1252, cs, flags);
604    pat = charset_compilepat(s);
605    CU_ASSERT(charset_search_mimeheader(s, pat, SUBJECT_CP1252, flags));
606    charset_freepat(pat);
607    charset_free(&cs);
608    free(s);
609}
610
611static void test_rfc5051(void)
612{
613    /* Example: codepoint U+01C4 (LATIN CAPITAL LETTER DZ WITH CARON)
614     * has a titlecase property of U+01C5 (LATIN CAPITAL LETTER D
615     * WITH SMALL LETTER Z WITH CARON).  Codepoint U+01C5 has a
616     * decomposition property of U+0044 (LATIN CAPITAL LETTER D)
617     * U+017E (LATIN SMALL LETTER Z WITH CARON).  U+017E has a
618     * decomposition property of U+007A (LATIN SMALL LETTER Z) U+030c
619     */
620    char *s;
621    static const char STR_RFC5051[] = {0xc7, 0x84, 0};
622    static const char RES_RFC5051[] = {'D', 'z', 0xcc, 0x8c, 0};
623    int flags = 0; /* super compliant */
624    charset_t cs;
625
626    cs = charset_lookupname("utf-8");
627    s = charset_convert(STR_RFC5051, cs, flags);
628    CU_ASSERT_PTR_NOT_NULL(s);
629    CU_ASSERT_STRING_EQUAL(s, RES_RFC5051);
630    charset_free(&cs);
631    free(s);
632}
633
634struct text_rock {
635    int ncalls;
636    struct buf out;
637};
638
639static void append_text(const struct buf *text, void *rock)
640{
641    struct text_rock *tr = (struct text_rock *)rock;
642
643    tr->ncalls++;
644    buf_append(&tr->out, text);
645}
646
/*
 * Run charset_extract() over the literal `in`, interpreted in charset
 * `cs` with transfer encoding `enc` and subtype `st`, collecting the
 * output through append_text().  The call must return 1, the callback
 * must fire exactly once, and the collected text must equal `exp`.
 *
 * The expansion is a bare compound statement and it reads a variable
 * named `flags` from the scope where it is used.
 */
#define TESTCASE(in, cs, enc, st, exp) \
    { \
        static const char _in[] = (in); \
        static const char _st[] = (st); \
        static const char _exp[] = (exp); \
        charset_t _cs = charset_lookupname(cs); \
        CU_ASSERT_PTR_NOT_NULL(_cs); \
        int _enc = (enc); \
        struct text_rock _rock; \
        struct buf _src = BUF_INITIALIZER; \
        int _rc; \
 \
        memset(&_rock, 0, sizeof(_rock)); \
        buf_init_ro(&_src, _in, sizeof(_in)-1); \
 \
        _rc = charset_extract(append_text, &_rock, &_src, \
                              _cs, _enc, _st, flags); \
        CU_ASSERT_EQUAL(_rc, 1); \
        CU_ASSERT_EQUAL(_rock.ncalls, 1); \
        CU_ASSERT_STRING_EQUAL(buf_cstring(&_rock.out), _exp); \
 \
        buf_free(&_src); \
        buf_free(&_rock.out); \
        charset_free(&_cs); \
    }
671
672static void test_extract(void)
673{
674    int flags = CHARSET_SKIPDIACRIT | CHARSET_MERGESPACE; /* default */
675    /* data thanks to hipsteripsum.me */
676
677    /* simplest case - no space, plain text is capitalised */
678    TESTCASE("freegan", "us-ascii", ENCODING_NONE, "PLAIN", "FREEGAN");
679
680    /* capitalised text is still capitalised */
681    TESTCASE("FANNY PACK", "us-ascii", ENCODING_NONE, "PLAIN", "FANNY PACK");
682
683    /* single spaces become single spaces */
684    TESTCASE("before they sold out",
685             "us-ascii", ENCODING_NONE, "PLAIN",
686             "BEFORE THEY SOLD OUT");
687
688    /* multiple spaces are squashed to a single spaces */
689    TESTCASE("you    probably \t haven't\r\nheard\t\r\tof them",
690             "us-ascii", ENCODING_NONE, "PLAIN",
691             "YOU PROBABLY HAVEN'T HEARD OF THEM");
692
693    /* invalid UTF-8 bytes become the Replacement character */
694    TESTCASE("a\300b", "us-ascii", ENCODING_NONE, "PLAIN", /* 0xC0 */
695             "A"UTF8_REPLACEMENT"B");
696    TESTCASE("a\301b", "us-ascii", ENCODING_NONE, "PLAIN", /* 0xC1 */
697             "A"UTF8_REPLACEMENT"B");
698    TESTCASE("a\365b", "us-ascii", ENCODING_NONE, "PLAIN", /* 0xF5 */
699             "A"UTF8_REPLACEMENT"B");
700    TESTCASE("a\366b", "us-ascii", ENCODING_NONE, "PLAIN", /* 0xF6 */
701             "A"UTF8_REPLACEMENT"B");
702    TESTCASE("a\367b", "us-ascii", ENCODING_NONE, "PLAIN", /* 0xF7 */
703             "A"UTF8_REPLACEMENT"B");
704    TESTCASE("a\370b", "us-ascii", ENCODING_NONE, "PLAIN", /* 0xF8 */
705             "A"UTF8_REPLACEMENT"B");
706    TESTCASE("a\371b", "us-ascii", ENCODING_NONE, "PLAIN", /* 0xF9 */
707             "A"UTF8_REPLACEMENT"B");
708    TESTCASE("a\372b", "us-ascii", ENCODING_NONE, "PLAIN", /* 0xFA */
709             "A"UTF8_REPLACEMENT"B");
710    TESTCASE("a\373b", "us-ascii", ENCODING_NONE, "PLAIN", /* 0xFB */
711             "A"UTF8_REPLACEMENT"B");
712    TESTCASE("a\374b", "us-ascii", ENCODING_NONE, "PLAIN", /* 0xFC */
713             "A"UTF8_REPLACEMENT"B");
714    TESTCASE("a\375b", "us-ascii", ENCODING_NONE, "PLAIN", /* 0xFD */
715             "A"UTF8_REPLACEMENT"B");
716    TESTCASE("a\376b", "us-ascii", ENCODING_NONE, "PLAIN", /* 0xFE */
717             "A"UTF8_REPLACEMENT"B");
718    TESTCASE("a\377b", "us-ascii", ENCODING_NONE, "PLAIN", /* 0xFF */
719             "A"UTF8_REPLACEMENT"B");
720
721    /* ill-formed UTF-8 sequences become the Replacement character */
722
723    /* 2-byte sequence lead byte then a non-continuation byte */
724    TESTCASE("a\302bcd", "us-ascii", ENCODING_NONE, "PLAIN",
725             "A"UTF8_REPLACEMENT"BCD");
726    /* 3-byte sequence lead byte then a non-continuation byte */
727    TESTCASE("a\340bcde", "us-ascii", ENCODING_NONE, "PLAIN",
728             "A"UTF8_REPLACEMENT"BCDE");
729    /* 4-byte sequence lead byte then a non-continuation byte */
730    TESTCASE("a\360bcdef", "us-ascii", ENCODING_NONE, "PLAIN",
731             "A"UTF8_REPLACEMENT"BCDEF");
732    /* unexpected continuation byte */
733    TESTCASE("a\240bc", "us-ascii", ENCODING_NONE, "PLAIN",
734             "A"UTF8_REPLACEMENT"BC");
735
736    /* HTML: correctly formed balanced tag pairs */
737    TESTCASE("<b>Photo</b> <em>booth</em>",
738        "us-ascii", ENCODING_NONE, "HTML",
739        "PHOTO BOOTH");
740
741    /* HTML: unbalanced tags */
742    TESTCASE("<b>American<b> <b>Apparel</b>",
743        "us-ascii", ENCODING_NONE, "HTML",
744        "AMERICAN APPAREL");
745
746    /* HTML: OMITTAG tags with and without end tags */
747    TESTCASE("<hr>Terry<hr> <hr>Richardson</hr>",
748        "us-ascii", ENCODING_NONE, "HTML",
749        " TERRY RICHARDSON ");
750
751    /* HTML: non-phrasing tags are replaced with whitespace */
752    TESTCASE("hella<br>mlkshk",
753        "us-ascii", ENCODING_NONE, "HTML",
754        "HELLA MLKSHK");
755    TESTCASE("godard<br/>synth",
756        "us-ascii", ENCODING_NONE, "HTML",
757        "GODARD SYNTH");
758    TESTCASE("<div>vinyl</div><div>narwhal</div>",
759        "us-ascii", ENCODING_NONE, "HTML",
760        " VINYL NARWHAL ");
761
762    /* HTML: quoted tag parameters */
763    TESTCASE("<a href=\"foo.html\">leggings</a> <img src\"beer.jpg\">gastropub",
764        "us-ascii", ENCODING_NONE, "HTML",
765        "LEGGINGS GASTROPUB");
766
767    /* HTML: unquoted tag parameters */
768    TESTCASE("<a href=foo.html>biodiesel</a> <img srcbeer.jpg>seitan",
769        "us-ascii", ENCODING_NONE, "HTML",
770        "BIODIESEL SEITAN");
771
772    /* HTML: contents of SCRIPT tag */
773    TESTCASE("viral <script>bicycle rights</script>readymade",
774        "us-ascii", ENCODING_NONE, "HTML",
775        "VIRAL READYMADE");
776
777    /* HTML: HTML4 SCRIPT tag with no contents */
778    TESTCASE("cardigan <script type=\"text/javascript\" "
779             "src=\"truffaut.js\"></script>williamsburg",
780        "us-ascii", ENCODING_NONE, "HTML",
781        "CARDIGAN WILLIAMSBURG");
782
783    /* HTML: XHTML SCRIPT empty-element-tag aka self-closing tag */
784    TESTCASE("brunch <script type=\"text/javascript\" "
785             "src=\"cred.js\"/>shoreditch",
786        "us-ascii", ENCODING_NONE, "HTML",
787        "BRUNCH SHOREDITCH");
788
789    /* HTML: contents of STYLE tag */
790    TESTCASE("pickled <style>whatever tumblr</style>stumptown",
791        "us-ascii", ENCODING_NONE, "HTML",
792        "PICKLED STUMPTOWN");
793
794    /* HTML: comments, correctly formed */
795    TESTCASE("pinterest <!-- master cleanse -->forage",
796        "us-ascii", ENCODING_NONE, "HTML",
797        "PINTEREST FORAGE");
798
799    /* HTML: comments correctly formed with embedded -- */
800    TESTCASE("polaroid <!-- food -- truck -->letterpress",
801        "us-ascii", ENCODING_NONE, "HTML",
802        "POLAROID LETTERPRESS");
803
804    /* HTML: comments correctly formed with embedded tags */
805    TESTCASE("semiotics <!-- messenger <hr> bag -->scenester",
806        "us-ascii", ENCODING_NONE, "HTML",
807        "SEMIOTICS SCENESTER");
808
809    /* HTML: comments correctly formed with embedded -> */
810    TESTCASE("butcher <!-- cosby -> sweater -->whatever",
811        "us-ascii", ENCODING_NONE, "HTML",
812        "BUTCHER WHATEVER");
813
814    /* HTML: comments correctly formed with ---> ending */
815    TESTCASE("ennui <!-- art party --->keffiyeh",
816        "us-ascii", ENCODING_NONE, "HTML",
817        "ENNUI KEFFIYEH");
818
819    /* HTML: trivial comment */
820    TESTCASE("street <!-->art",
821        "us-ascii", ENCODING_NONE, "HTML",
822        "STREET ART");
823
824    /* HTML: initial DOCTYPE is ignored */
825    TESTCASE("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01//EN\" "
826             "\"http://www.w3.org/TR/html4/strict.dtd\">ethnic sustainable",
827        "us-ascii", ENCODING_NONE, "HTML",
828        "ETHNIC SUSTAINABLE");
829
830    /* HTML: simple character references */
831    TESTCASE("&quot;Twee &amp; Keytar&quot; &lt;dreamcatcher@umami.org&gt;",
832        "us-ascii", ENCODING_NONE, "HTML",
833        "\"TWEE & KEYTAR\" <DREAMCATCHER@UMAMI.ORG>");
834
835    /* HTML: naked & is emitted */
836    TESTCASE("gentrify&<b>sartorial</b>",
837        "us-ascii", ENCODING_NONE, "HTML",
838        "GENTRIFY&SARTORIAL");
839
840    /* HTML: non-zero length unterminated entities are emitted */
841    TESTCASE("tattooed&amp locavore",
842        "us-ascii", ENCODING_NONE, "HTML",
843        "TATTOOED& LOCAVORE");
844
845    /* HTML: decimal Unicode entities: U+267B RECYCLE SYMBOL */
846    TESTCASE("odd&#9851;future",
847        "us-ascii", ENCODING_NONE, "HTML",
848        "ODD♻FUTURE");
849
850    /* HTML: hexadecimal Unicode entities: U+2704 SCISSORS */
851    TESTCASE("odd&#x2704;future",
852        "us-ascii", ENCODING_NONE, "HTML",
853        "ODD✄FUTURE");
854
855    /* HTML: compatibility numerical character references */
856    TESTCASE(
857        "A&#128;B&#129;C&#130;D&#131;"
858        "E&#132;F&#133;G&#134;H&#135;"
859        "I&#136;J&#137;K&#138;L&#139;"
860        "M&#140;N&#141;O&#142;P&#143;"
861        "Q&#144;R&#145;S&#146;T&#147;"
862        "U&#148;V&#149;W&#150;X&#151;"
863        "Y&#152;Z&#153;A&#154;B&#155;"
864        "C&#156;D&#157;E&#158;F&#159;g",
865        "us-ascii", ENCODING_NONE, "HTML",
866        "A€BC‚DƑ"  /* ƒ capitalised */
867        "E„F...G†H‡"      /* … normalised to ... */
868        "IˆJ‰KSL‹" /* Š normalised to S */
869        "MŒNOZP"       /* Ž normalised to Z */
870        "QR‘S’T“"
871        "U”V•W–X—"
872        "Y˜ZTMASB›"  /* š capitalised then normalised to S,
873                         * ™ normalised to TM */
874        "CŒDEZFYG")    /* œ capitalised to Œ,
875                           ž capitalised then normalised to Z,
876                         * Ÿ normalised to Y */
877
878    /* HTML: numerical character references to invalid Unicode
879     * codepoints and valid codepoints just adjacent to invalid
880     * ranges.  HTML5 requires us to emit a Replacement char. */
881    TESTCASE("A&#xd7ff;B", "us-ascii", ENCODING_NONE, "HTML", "A\355\237\277B");
882    TESTCASE("A&#xd800;B", "us-ascii", ENCODING_NONE, "HTML", "A"UTF8_REPLACEMENT"B");
883    TESTCASE("A&#xd801;B", "us-ascii", ENCODING_NONE, "HTML", "A"UTF8_REPLACEMENT"B");
884    TESTCASE("A&#xdffe;B", "us-ascii", ENCODING_NONE, "HTML", "A"UTF8_REPLACEMENT"B");
885    TESTCASE("A&#xdfff;B", "us-ascii", ENCODING_NONE, "HTML", "A"UTF8_REPLACEMENT"B");
886    TESTCASE("A&#xc000;B", "us-ascii", ENCODING_NONE, "HTML", "A\354\200\200B");
887    TESTCASE("A&#x10fffd;B", "us-ascii", ENCODING_NONE, "HTML", "A\364\217\277\275B");
888    TESTCASE("A&#x110000;B", "us-ascii", ENCODING_NONE, "HTML", "A"UTF8_REPLACEMENT"B");
889    TESTCASE("A&#x7fffffff;B", "us-ascii", ENCODING_NONE, "HTML", "A"UTF8_REPLACEMENT"B");
890    TESTCASE("A&#xffffffff;B", "us-ascii", ENCODING_NONE, "HTML", "A"UTF8_REPLACEMENT"B");
891
892    /* HTML: zero numerical character reference.  The HTML5 spec says
893     * to return a Replacement char. */
894    TESTCASE("A&#0;B", "us-ascii", ENCODING_NONE, "HTML", "A"UTF8_REPLACEMENT"B");
895
896    /* HTML: numerical character references whose codepoints the HTML5
897     * spec says are a parse error.  We just silently swallow these. */
898    /* U+0001..U+0008 */
899    TESTCASE("A&#1;B", "us-ascii", ENCODING_NONE, "HTML", "AB");
900    TESTCASE("A&#8;B", "us-ascii", ENCODING_NONE, "HTML", "AB");
901    /* U+000B */
902    TESTCASE("A&#xb;B", "us-ascii", ENCODING_NONE, "HTML", "AB");
903    /* U+000E..U+001F */
904    TESTCASE("A&#xe;B", "us-ascii", ENCODING_NONE, "HTML", "AB");
905    TESTCASE("A&#x1f;B", "us-ascii", ENCODING_NONE, "HTML", "AB");
906    /* U+007F..U+009f, when not a compatibility codepoint */
907    TESTCASE("A&#x7f;B", "us-ascii", ENCODING_NONE, "HTML", "AB");
908    TESTCASE("A&#x81;B", "us-ascii", ENCODING_NONE, "HTML", "AB");
909    TESTCASE("A&#x8D;B", "us-ascii", ENCODING_NONE, "HTML", "AB");
910    TESTCASE("A&#x8F;B", "us-ascii", ENCODING_NONE, "HTML", "AB");
911    TESTCASE("A&#x90;B", "us-ascii", ENCODING_NONE, "HTML", "AB");
912    TESTCASE("A&#x9D;B", "us-ascii", ENCODING_NONE, "HTML", "AB");
913    /* U+FDD0..U+FDEF */
914    TESTCASE("A&#xFDD0;B", "us-ascii", ENCODING_NONE, "HTML", "AB");
915    TESTCASE("A&#xFDEF;B", "us-ascii", ENCODING_NONE, "HTML", "AB");
916    /* the last two codepoints in each plane */
917    TESTCASE("A&#xFFFE;B", "us-ascii", ENCODING_NONE, "HTML", "AB");
918    TESTCASE("A&#xFFFF;B", "us-ascii", ENCODING_NONE, "HTML", "AB");
919    TESTCASE("A&#x1FFFE;B", "us-ascii", ENCODING_NONE, "HTML", "AB");
920    TESTCASE("A&#x1FFFF;B", "us-ascii", ENCODING_NONE, "HTML", "AB");
921    TESTCASE("A&#x2FFFE;B", "us-ascii", ENCODING_NONE, "HTML", "AB");
922    TESTCASE("A&#x2FFFF;B", "us-ascii", ENCODING_NONE, "HTML", "AB");
923    TESTCASE("A&#x3FFFE;B", "us-ascii", ENCODING_NONE, "HTML", "AB");
924    TESTCASE("A&#x3FFFF;B", "us-ascii", ENCODING_NONE, "HTML", "AB");
925    TESTCASE("A&#x4FFFE;B", "us-ascii", ENCODING_NONE, "HTML", "AB");
926    TESTCASE("A&#x4FFFF;B", "us-ascii", ENCODING_NONE, "HTML", "AB");
927    TESTCASE("A&#x5FFFE;B", "us-ascii", ENCODING_NONE, "HTML", "AB");
928    TESTCASE("A&#x5FFFF;B", "us-ascii", ENCODING_NONE, "HTML", "AB");
929    TESTCASE("A&#x6FFFE;B", "us-ascii", ENCODING_NONE, "HTML", "AB");
930    TESTCASE("A&#x6FFFF;B", "us-ascii", ENCODING_NONE, "HTML", "AB");
931    TESTCASE("A&#x7FFFE;B", "us-ascii", ENCODING_NONE, "HTML", "AB");
932    TESTCASE("A&#x7FFFF;B", "us-ascii", ENCODING_NONE, "HTML", "AB");
933    TESTCASE("A&#x8FFFE;B", "us-ascii", ENCODING_NONE, "HTML", "AB");
934    TESTCASE("A&#x8FFFF;B", "us-ascii", ENCODING_NONE, "HTML", "AB");
935    TESTCASE("A&#x9FFFE;B", "us-ascii", ENCODING_NONE, "HTML", "AB");
936    TESTCASE("A&#x9FFFF;B", "us-ascii", ENCODING_NONE, "HTML", "AB");
937    TESTCASE("A&#xAFFFE;B", "us-ascii", ENCODING_NONE, "HTML", "AB");
938    TESTCASE("A&#xAFFFF;B", "us-ascii", ENCODING_NONE, "HTML", "AB");
939    TESTCASE("A&#xBFFFE;B", "us-ascii", ENCODING_NONE, "HTML", "AB");
940    TESTCASE("A&#xBFFFF;B", "us-ascii", ENCODING_NONE, "HTML", "AB");
941    TESTCASE("A&#xCFFFE;B", "us-ascii", ENCODING_NONE, "HTML", "AB");
942    TESTCASE("A&#xCFFFF;B", "us-ascii", ENCODING_NONE, "HTML", "AB");
943    TESTCASE("A&#xDFFFE;B", "us-ascii", ENCODING_NONE, "HTML", "AB");
944    TESTCASE("A&#xDFFFF;B", "us-ascii", ENCODING_NONE, "HTML", "AB");
945    TESTCASE("A&#xEFFFE;B", "us-ascii", ENCODING_NONE, "HTML", "AB");
946    TESTCASE("A&#xEFFFF;B", "us-ascii", ENCODING_NONE, "HTML", "AB");
947    TESTCASE("A&#xFFFFE;B", "us-ascii", ENCODING_NONE, "HTML", "AB");
948    TESTCASE("A&#xFFFFF;B", "us-ascii", ENCODING_NONE, "HTML", "AB");
949    TESTCASE("A&#x10FFFE;B", "us-ascii", ENCODING_NONE, "HTML", "AB");
950    TESTCASE("A&#x10FFFF;B", "us-ascii", ENCODING_NONE, "HTML", "AB");
951
952    /* HTML: some of the more obscure named character references.  The
953     * tricky part is testing the case sensitivity and unusual character
954     * generation of the HTML character reference matching code, while
955     * the search normalisation code gets in the way. */
956
957    /* &alpha; and &Alpha; are both defined but both get normalised
958     * to GREEK CAPITAL LETTER ALPHA */
959    TESTCASE("A&alpha;B", "us-ascii", ENCODING_NONE, "HTML", "AΑB");
960    TESTCASE("A&Alpha;B", "us-ascii", ENCODING_NONE, "HTML", "AΑB");
961    /* &clubs; is defined, &Clubs is not */
962    TESTCASE("A&clubs;B", "us-ascii", ENCODING_NONE, "HTML", "A♣B");
963    TESTCASE("A&Clubs;B", "us-ascii", ENCODING_NONE, "HTML", "A"UTF8_REPLACEMENT"B");
964    /* &fjlig; is defined to emit a 2-codepoint sequence */
965    TESTCASE("A&fjlig;B", "us-ascii", ENCODING_NONE, "HTML", "AFJB");
966    /* &ycirc; emits a codepoint which is then normalised and capitalised */
967    TESTCASE("A&ycirc;B", "us-ascii", ENCODING_NONE, "HTML", "AYB");
968    /* &uparrow; and &UpArrow; are both defined to the same codepoint,
969     * which survives normalisation intact, but neither &UParrow; nor
970     * &upARROW; are defined.  &Uparrow is defined to a *different*
971     * codepoint which also survives normalisation. */
972    TESTCASE("A&uparrow;B", "us-ascii", ENCODING_NONE, "HTML", "A↑B");
973    TESTCASE("A&Uparrow;B", "us-ascii", ENCODING_NONE, "HTML", "A⇑B");
974    TESTCASE("A&UpArrow;B", "us-ascii", ENCODING_NONE, "HTML", "A↑B");
975    TESTCASE("A&UParrow;B", "us-ascii", ENCODING_NONE, "HTML", "A"UTF8_REPLACEMENT"B");
976    TESTCASE("A&upARROW;B", "us-ascii", ENCODING_NONE, "HTML", "A"UTF8_REPLACEMENT"B");
977
978    /* &nonesuch; is most definitely not defined */
979    TESTCASE("A&nonesuch;B", "us-ascii", ENCODING_NONE, "HTML", "A"UTF8_REPLACEMENT"B");
980
981    /* HTML: Strip HTML from snippet and don't canonify */
982    flags = CHARSET_SNIPPET;
983    TESTCASE("<b>Photo</b> <em>booth</em>", "us-ascii", ENCODING_NONE, "HTML", "Photo booth");
984
985    /* HTML: Strip HTML from snippet, there is nothing to escape */
986    flags = CHARSET_SNIPPET|CHARSET_ESCAPEHTML;
987    TESTCASE("<b>Photo</b>", "us-ascii", ENCODING_NONE, "HTML", "Photo");
988
989    /* PLAIN: Generate snippet, don't escape HTML by default */
990    flags = CHARSET_SNIPPET;
991    TESTCASE("<b>Photo</b>", "us-ascii", ENCODING_NONE, "PLAIN", "<b>Photo</b>");
992
993    /* PLAIN: Generate snippet with escaped HTML */
994    flags = CHARSET_SNIPPET|CHARSET_ESCAPEHTML;
995    TESTCASE("<b>Photo</b>", "us-ascii", ENCODING_NONE, "PLAIN", "&lt;b&gt;Photo&lt;/b&gt;");
996}
997#undef TESTCASE
998
999static void test_utf8_to_searchform(void)
1000{
1001    char *s;
1002
1003    /* LATIN SMALL LETTER I WITH DIAERESIS' (U+00EF) */
1004    static const char UTF8_1[] = "\xC3\xAF";
1005    static const char SEARCH_1[] = "I";
1006
1007    /* LATIN CAPITAL LETTER I WITH DIAERESIS' (U+00CF) */
1008    static const char UTF8_2[] = "\xC3\x8F";
1009    static const char SEARCH_2[] = "I";
1010
1011    /* LATIN CAPITAL LETTER I (U+0049) COMBINING DIAERESIS (U+0308) */
1012    static const char UTF8_3[] = "\x49\xCC\x88";
1013    static const char SEARCH_3[] = "I";
1014
1015    /* LATIN SMALL LETTER I' (U+0069) COMBINING DIAERESIS (U+0408) */
1016    static const char UTF8_4[] = "\x69\xCC\x88";
1017    static const char SEARCH_4[] = "I";
1018
1019    int flags = CHARSET_SKIPDIACRIT | CHARSET_MERGESPACE; /* default */
1020
1021    s = charset_utf8_to_searchform(UTF8_1, flags);
1022    CU_ASSERT_PTR_NOT_NULL(s);
1023    CU_ASSERT_STRING_EQUAL(s, SEARCH_1);
1024    free(s);
1025
1026    s = charset_utf8_to_searchform(UTF8_2, flags);
1027    CU_ASSERT_PTR_NOT_NULL(s);
1028    CU_ASSERT_STRING_EQUAL(s, SEARCH_2);
1029    free(s);
1030
1031    s = charset_utf8_to_searchform(UTF8_3, flags);
1032    CU_ASSERT_PTR_NOT_NULL(s);
1033    CU_ASSERT_STRING_EQUAL(s, SEARCH_3);
1034    free(s);
1035
1036    s = charset_utf8_to_searchform(UTF8_4, flags);
1037    CU_ASSERT_PTR_NOT_NULL(s);
1038    CU_ASSERT_STRING_EQUAL(s, SEARCH_4);
1039    free(s);
1040}
1041
1042static void test_charset_decode(void)
1043{
1044
1045#define TESTCASE(in, inlen, want, wantlen, enc) \
1046    { \
1047        struct buf _dst = BUF_INITIALIZER; \
1048        static const char _in[] = (in); \
1049        static const char _want[] = (want); \
1050        int _enc = (enc); \
1051        size_t _inlen = (inlen); \
1052        size_t _wantlen = (wantlen); \
1053        int _r; \
1054        _r = charset_decode(&_dst, _in, _inlen, _enc); \
1055        CU_ASSERT(_r == 0); \
1056        CU_ASSERT(_dst.len == _wantlen); \
1057        { \
1058            const char *p; \
1059            for(p = _dst.s; p < _dst.s + _dst.len; p++) { \
1060                if (0) fprintf(stderr, "%x\n", *p); \
1061            } \
1062        } \
1063        CU_ASSERT(memcmp(_dst.s, _want, _wantlen) == 0); \
1064    }
1065
1066    TESTCASE("", 0, "", 0, ENCODING_NONE);
1067    TESTCASE("", 0, "", 0, ENCODING_BASE64);
1068    TESTCASE("Hello", 5, "Hello", 5, ENCODING_NONE);
1069    TESTCASE("beefc0de", 8, "\x6d\xe7\x9f\x73\x47\x5e", 6, ENCODING_BASE64);
1070
1071#undef TESTCASE
1072}
1073
1074/* vim: set ft=c: */
1075