cyrus-imapd-3.0.16/cunit/charset.testc

#include <stdlib.h>

#include "cunit/cunit.h"
#include "charset.h"

extern int charset_debug;

/* The Unicode Replacement character 0xfffd in UTF-8 encoding */
#define UTF8_REPLACEMENT    "\357\277\275"
/* The Replacement char after search normalisation */
#define SEARCH_REPLACEMENT  "\377"

/*
 * Exercise charset_lookupname(): well-known charsets must resolve, lookups
 * must ignore the case of the name, and charset_name() must report the
 * expected name for the ICU-backed charsets listed below.
 */
static void test_lookupname(void)
{
    /* alternative spellings of us-ascii: names are case-insensitive */
    static const char * const ascii_spellings[] = { "US-ASCII", "Us-AsCiI" };

    /* some others must also exist */
    static const char * const required[] = { "utf-8", "utf-7", "iso-8859-1" };

    /*
     * Assert that ICU-backed charsets return the Cyrus-canonical name. E.g:
     * ICU names windows-1252 as ibm-5348_P100-1997 but we don't want that.
     * The ICU name is still used when there is no good alias for it, which
     * is what the ebcdic-ar entry checks.
     */
    static const struct {
        const char *lookup;
        const char *reported;
    } named[] = {
        { "windows-1252", "windows-1252" },
        { "1252",         "windows-1252" },
        { "ebcdic-ar",    "ibm-16804_X110-1999" },
    };

    charset_t ascii, other;
    size_t i;

    /* us-ascii must exist */
    ascii = charset_lookupname("us-ascii");
    CU_ASSERT_PTR_NOT_NULL(ascii);

    for (i = 0; i < sizeof(ascii_spellings) / sizeof(ascii_spellings[0]); i++) {
        other = charset_lookupname(ascii_spellings[i]);
        CU_ASSERT_PTR_NOT_NULL(other);
        CU_ASSERT_STRING_EQUAL(charset_name(ascii), charset_name(other));
        charset_free(&other);
    }
    charset_free(&ascii);

    for (i = 0; i < sizeof(required) / sizeof(required[0]); i++) {
        other = charset_lookupname(required[i]);
        CU_ASSERT_PTR_NOT_NULL(other);
        charset_free(&other);
    }

    for (i = 0; i < sizeof(named) / sizeof(named[0]); i++) {
        other = charset_lookupname(named[i].lookup);
        CU_ASSERT_PTR_NOT_NULL(other);
        CU_ASSERT_STRING_EQUAL(charset_name(other), named[i].reported);
        charset_free(&other);
    }
}

/*
 * Exercise charset_to_utf8() with the us-ascii charset: empty input,
 * rejected arguments, plain text, a byte outside the ASCII range, and the
 * base64 and quoted-printable encodings.
 */
static void test_to_utf8(void)
{
    static const char HELLO[] = "Hello World";
    static const char HELLO_BADBYTE[] = "Hello W\370rld";
    static const char HELLO_REPLACED[] = "Hello W" UTF8_REPLACEMENT "rld";
    static const char HELLO_BASE64[] = "SGVsbG8gV29ybGQ=";
    static const char TRUTH_QP[] =
        "If you believe that truth=3Dbeauty, then surely=20=\r\n"
        "mathematics is the most beautiful branch of philosophy.\r\n";
    static const char TRUTH_TEXT[] =
        "If you believe that truth=beauty, then surely "
        "mathematics is the most beautiful branch of philosophy.\r\n";

    /* conversions that must succeed, in the order they are run */
    const struct {
        const char *in;
        size_t len;
        int encoding;
        const char *want;
    } good[] = {
        /* simple ASCII string */
        { HELLO, sizeof(HELLO)-1, ENCODING_NONE, HELLO },
        /* ASCII string with an invalid character */
        { HELLO_BADBYTE, sizeof(HELLO_BADBYTE)-1, ENCODING_NONE, HELLO_REPLACED },
        /* base64 encoding */
        { HELLO_BASE64, sizeof(HELLO_BASE64)-1, ENCODING_BASE64, HELLO },
        /* Quoted-printable encoding */
        { TRUTH_QP, sizeof(TRUTH_QP)-1, ENCODING_QP, TRUTH_TEXT },
    };

    charset_t ascii;
    char *out;
    size_t i;

    ascii = charset_lookupname("us-ascii");
    CU_ASSERT_PTR_NOT_NULL(ascii);

    /* zero length input */
    out = charset_to_utf8("", 0, ascii, ENCODING_NONE);
    CU_ASSERT_PTR_NOT_NULL(out);
    CU_ASSERT_STRING_EQUAL(out, "");
    free(out);

    /* invalid encoding */
    out = charset_to_utf8(HELLO, sizeof(HELLO), ascii, 0xdeadbeef);
    CU_ASSERT_PTR_NULL(out);

    /* invalid charset */
    out = charset_to_utf8(HELLO, sizeof(HELLO), NULL, ENCODING_NONE);
    CU_ASSERT_PTR_NULL(out);

    for (i = 0; i < sizeof(good) / sizeof(good[0]); i++) {
        out = charset_to_utf8(good[i].in, good[i].len, ascii, good[i].encoding);
        CU_ASSERT_PTR_NOT_NULL(out);
        CU_ASSERT_STRING_EQUAL(out, good[i].want);
        free(out);
    }

    charset_free(&ascii);
}

/*
 * Round-trip checks between UTF-8 and the "imap-utf-7" charset: every UTF-8
 * input must convert to the listed IMAP UTF-7 text via charset_to_imaputf7(),
 * and that text must convert back to the original input via
 * charset_to_utf8().
 */
static void test_to_imaputf7(void)
{
    static const struct {
        const char *plain;      /* UTF-8 input */
        const char *imap;       /* expected IMAP UTF-7 form */
    } cases[] = {
        /* Plain IMAP UTF-7 */
        { "Hello, World", "Hello, World" },

        /* Escaped ampersand */
        { "Laurel&Hardy", "Laurel&-Hardy" },

        /* LATIN SMALL LETTER O WITH DIAERESIS (U+00F6) */
        { "Tr""\xC3\xB6""del", "Tr&APY-del" },

        /* LATIN SMALL LETTER E WITH ACUTE (U+00E9) */
        { "R""\xC3\xA9""pertoire", "R&AOk-pertoire" },

        /* WHITE SMILING FACE (U+263A) */
        { "Hi Mom \xE2\x98\xBA!", "Hi Mom &Jjo-!" },

        /* WHITE SMILING FACE (U+263A) at end */
        { "Hi Mom \xE2\x98\xBA", "Hi Mom &Jjo-" },

        /* DESERET SMALL LETTER YEE (U+10437) & HAN Character (U+24B62) */
        { "\xF0\x90\x90\xB7""&""\xF0\xA4\xAD\xA2", "&2AHcNw-&-&2FLfYg-" },

        /* CARRIAGE RETURN (CR) (U+000D) LINE FEED (LF) (U+000A) */
        { "\x0D\x0A", "&AA0ACg-" },
    };
    size_t i;

    charset_t utf8 = charset_lookupname("utf-8");
    CU_ASSERT_PTR_NOT_NULL(utf8);
    charset_t imap7 = charset_lookupname("imap-utf-7");
    CU_ASSERT_PTR_NOT_NULL(imap7);

    for (i = 0; i < sizeof(cases) / sizeof(cases[0]); i++) {
        const char *plain = cases[i].plain;
        char *encoded;
        char *decoded;

        /* forward: UTF-8 to IMAP UTF-7 */
        encoded = charset_to_imaputf7(plain, strlen(plain), utf8, ENCODING_NONE);
        CU_ASSERT_PTR_NOT_NULL(encoded);
        CU_ASSERT_STRING_EQUAL(encoded, cases[i].imap);

        /* and back again */
        decoded = charset_to_utf8(encoded, strlen(encoded), imap7, ENCODING_NONE);
        CU_ASSERT_PTR_NOT_NULL(decoded);
        CU_ASSERT_STRING_EQUAL(decoded, plain);

        free(decoded);
        free(encoded);
    }

    charset_free(&utf8);
    charset_free(&imap7);
}

/*
 * Convert short samples from assorted legacy charsets to UTF-8 with
 * charset_to_utf8(), looking each charset up by one of its aliases.
 */
static void test_misc_charsets(void)
{
    static const struct {
        const char *alias;      /* name handed to charset_lookupname() */
        const char *raw;        /* bytes in that charset */
        const char *utf8;       /* expected UTF-8 result */
    } cases[] = {
        /* MSDOS Latin1 aka CP-850 */
        { "cp850", "Hello, World", "Hello, World" },
        { "cp850", "fa""\x87""ade", "fa""\xc3\xa7""ade" },

        /* Windows-31J aka CP-932 */
        { "windows-31J", "Hello, World", "Hello, World" },
        { "cp932", "Hello, ""\x90\xa2\x8a\x45",
                   "Hello, ""\xe4\xb8\x96\xe7\x95\x8c" },

        /* Windows-936 aka CP-936 */
        { "windows-936", "Hello, World", "Hello, World" },
        { "cp936", "\xC4\xE3\xBA\xC3\xA3\xAC\xCA\xC0\xBD\xE7",
                   "\xE4\xBD\xA0\xE5\xA5\xBD\xEF\xBC\x8C\xE4"
                   "\xB8\x96\xE7\x95\x8C" },

        /* Windows-1257 aka CP-1257 */
        { "windows-1257", "Hello, World", "Hello, World" },
        { "cp1257", "\xe0\xd8\xc2", "\xC4\x85\xC5\xB2\xC4\x80" },

        /* KOI8-U */
        { "koi8-u", "Hello, World", "Hello, World" },
        { "koi8-u", "\xA4\xA6\xA7\xAD\xB4\xB6\xB7\xBD",
                    "\xD1\x94\xD1\x96\xD1\x97\xD2\x91"
                    "\xD0\x84\xD0\x86\xD0\x87\xD2\x90" },
    };
    size_t i;

    for (i = 0; i < sizeof(cases) / sizeof(cases[0]); i++) {
        char *converted;
        charset_t charset = charset_lookupname(cases[i].alias);
        CU_ASSERT_PTR_NOT_NULL(charset);

        converted = charset_to_utf8(cases[i].raw, strlen(cases[i].raw),
                                    charset, ENCODING_NONE);
        CU_ASSERT_PTR_NOT_NULL(converted);
        CU_ASSERT_STRING_EQUAL(converted, cases[i].utf8);

        free(converted);
        charset_free(&charset);
    }
}

/*
 * Corner cases of Quoted-Printable decoding through charset_to_utf8().
 */
static void test_qp(void)
{
    /*
     * Decode `in` (length taken from the literal, terminator excluded) using
     * charset `cs` and encoding `enc`, and compare the result with `exp`.
     */
#define TESTCASE(in, cs, enc, exp) \
    do { \
        static const char qp_text[] = (in); \
        static const char plain_text[] = (exp); \
        charset_t charset = charset_lookupname(cs); \
        CU_ASSERT_PTR_NOT_NULL(charset); \
        int encoding = (enc); \
        char *decoded = charset_to_utf8(qp_text, sizeof(qp_text)-1, \
                                        charset, encoding); \
        CU_ASSERT_PTR_NOT_NULL(decoded); \
        CU_ASSERT_STRING_EQUAL(decoded, plain_text); \
        free(decoded); \
        charset_free(&charset); \
    } while (0)

    /* encoding of SP */
    TESTCASE("ab=20xy", "us-ascii", ENCODING_QP, "ab xy");

    /* encoding of '=' */
    TESTCASE("ab=3Dxy", "us-ascii", ENCODING_QP, "ab=xy");

    /* lowercase hex digits are accepted as well */
    TESTCASE("ab=3dxy", "us-ascii", ENCODING_QP, "ab=xy");

    /* underscore is not special outside of headers */
    TESTCASE("ab_xy", "us-ascii", ENCODING_QP, "ab_xy");

    /* invalid characters after = are passed through
     * even if one of them is a valid hexchar */
    TESTCASE("ab=ZZxy", "us-ascii", ENCODING_QP, "ab=ZZxy");
    TESTCASE("ab=ZCxy", "us-ascii", ENCODING_QP, "ab=ZCxy");
    TESTCASE("ab=CZxy", "us-ascii", ENCODING_QP, "ab=CZxy");
    TESTCASE("ab=Zcxy", "us-ascii", ENCODING_QP, "ab=Zcxy");
    TESTCASE("ab=cZxy", "us-ascii", ENCODING_QP, "ab=cZxy");

    /* soft line break */
    TESTCASE("ab=\r\nxy", "us-ascii", ENCODING_QP, "abxy");

#undef TESTCASE
}

/*
 * Exercise charset_encode_mimeheader(): the output must match the expected
 * text exactly and no line of the output may be longer than 76 characters.
 */
static void test_encode_mimeheader(void)
{
    /*
     * Encode `in`, compare against `exp`, then walk the result and check
     * the length of every LF-terminated line as well as the final,
     * unterminated one.  Each line is measured from its own first character
     * up to, but not including, the LF; a CR preceding the LF is therefore
     * still counted, which keeps the check slightly conservative.
     */
#define TESTCASE(in, exp) \
    do { \
        static const char _in[] = (in); \
        static const char _exp[] = (exp); \
        char *s = charset_encode_mimeheader(_in, 0); \
        CU_ASSERT_PTR_NOT_NULL(s); \
        CU_ASSERT_STRING_EQUAL(s, _exp); \
        const char *p, *line; \
        for (line = s, p = s; *p != '\0'; p++) { \
            if (*p == '\n') { \
                CU_ASSERT(p - line <= 76); \
                line = p + 1; \
            } \
        } \
        CU_ASSERT(p - line <= 76); \
        free(s); \
    } while (0)

    /* nothing to encode */
    TESTCASE("abc", "abc");

    TESTCASE("abc\r\n", "=?UTF-8?Q?abc?=");

    /* bogus indent */
    TESTCASE("abc\r\nxyz", "=?UTF-8?Q?abc?=\r\n =?UTF-8?Q?xyz?=");

    /* wrap */
    TESTCASE("abc\r\n xyz", "=?UTF-8?Q?abc?=\r\n =?UTF-8?Q?xyz?=");

    /* three-byte UTF-8 word barely fits line length limit */
    TESTCASE("0123456789012345678901234567890123456789012345678901234\xe2\x82\xac",
            "=?UTF-8?Q?0123456789012345678901234567890123456789012345678901234=E2=82=AC?=");

    /* three-byte UTF-8 word must not be split */
    TESTCASE("01234567890123456789012345678901234567890123456789012345\xe2\x82\xac",
            "=?UTF-8?Q?01234567890123456789012345678901234567890123456789012345?="
            "\r\n ""=?UTF-8?Q?=E2=82=AC?=");

#undef TESTCASE
}


/*
 * Exercise charset_decode_mimeheader() with plain text, Q-encoded words in
 * us-ascii / iso-8859-1 / utf-8 and a B-encoded EUC-KR word, under various
 * combinations of the CHARSET_* flags.
 */
static void test_decode_mimeheader(void)
{
    static const char PLAIN[] = "Lorem IPSUM dolor \t \t  sit amet";
    static const char QWORD_ASCII[] = "Lorem =?us-ascii?q?ipsum?= dolor "
                                      "=?US-ASCII?Q?sit amet?=";
    static const char QWORD_LATIN1[] = "Lorem =?iso-8859-1?q?ips=fcm?= \t"
                                       "DOLOR =?iso-8859-1?Q?s=eft am=ebt?=";
    static const char QWORD_HTML[] = "=?utf-8?q?=C2=A1<em>Hola</em>,_se=C3=B1or!?=";
    static const char BWORD_EUC_KR[] = "A =?EUC-KR?B?wMzIo8Dn?= B";

    static const char WANT_LOREM[] = "LOREM IPSUM DOLOR SIT AMET";
    static const char WANT_LOREM_DIACRIT[] = "LOREM IPSÜM DOLOR SÏT AMËT";
    static const char WANT_LOREM_NOSPACE[] = "LOREMIPSUMDOLORSITAMET";
    static const char WANT_LOREM_NOSPACE_DIACRIT[] = "LOREMIPSÜMDOLORSÏTAMËT";
    static const char WANT_LOREM_RAWSPACE[] = "LOREM IPSÜM  DOLOR SÏT AMËT";
    static const char WANT_HTML_DECOMPOSED[] = "¡<EM>HOLA</EM>, SEN""\xcc\x83""OR!";
    static const char WANT_HTML_STRIPPED[] = "¡<EM>HOLA</EM>, SENOR!";
    static const char WANT_HTML_SNIPPET[] = "¡<em>Hola</em>, señor!";
    static const char WANT_HTML_ESCAPED[] = "¡&lt;em&gt;Hola&lt;/em&gt;, señor!";
    static const char WANT_EUC_KR[] = "A""\x20\xec\x9d\xb4\xed\x98\xb8\xec\x9e\xac""B";

    const int dflt = CHARSET_SKIPDIACRIT | CHARSET_MERGESPACE; /* default */

    /* inputs that must decode to a non-NULL result, in run order */
    const struct {
        const char *header;
        int flags;
        const char *want;
    } cases[] = {
        { "",            dflt, "" },
        { PLAIN,         dflt, WANT_LOREM },
        { QWORD_ASCII,   dflt, WANT_LOREM },
        { QWORD_LATIN1,  dflt, WANT_LOREM },
        { QWORD_LATIN1,  CHARSET_MERGESPACE, WANT_LOREM_DIACRIT },
        { QWORD_LATIN1,  CHARSET_SKIPSPACE | CHARSET_SKIPDIACRIT,
                         WANT_LOREM_NOSPACE },
        { QWORD_LATIN1,  CHARSET_SKIPSPACE, WANT_LOREM_NOSPACE_DIACRIT },
        { QWORD_LATIN1,  0, WANT_LOREM_RAWSPACE },
        { QWORD_HTML,    0, WANT_HTML_DECOMPOSED },
        { QWORD_HTML,    CHARSET_SKIPDIACRIT, WANT_HTML_STRIPPED },
        { QWORD_HTML,    CHARSET_SNIPPET, WANT_HTML_SNIPPET },
        { QWORD_HTML,    CHARSET_SNIPPET|CHARSET_ESCAPEHTML, WANT_HTML_ESCAPED },
        { BWORD_EUC_KR,  dflt, WANT_EUC_KR },
    };

    char *decoded;
    size_t i;

    /* a NULL header yields a NULL result */
    decoded = charset_decode_mimeheader(NULL, dflt);
    CU_ASSERT_PTR_NULL(decoded);
    free(decoded);

    for (i = 0; i < sizeof(cases) / sizeof(cases[0]); i++) {
        decoded = charset_decode_mimeheader(cases[i].header, cases[i].flags);
        CU_ASSERT_PTR_NOT_NULL(decoded);
        CU_ASSERT_STRING_EQUAL(decoded, cases[i].want);
        free(decoded);
    }
}

/*
 * Exercise charset_parse_mimeheader() with and without CHARSET_MIME_UTF8.
 * Per the expectations below, raw UTF-8 bytes outside an encoded-word come
 * back as Replacement characters when the flag is clear and are kept when it
 * is set, while a raw Latin-1 byte is replaced in both modes.
 */
static void test_parse_mimeheader(void)
{
    static const char IN_ASCII[] = "Lorem IPSUM";
    static const char IN_UTF8[] = "=?utf-8?q?=C2=A1Hola,_se=C3=B1or!?= Lorem IPSÜM";
    static const char IN_LATIN1[] = "=?ISO-8859-1?q?Caf=E9?= Lorem IPS""\xDC""M";

    static const char WANT_UTF8_REPLACED[] =
        "¡Hola, señor! Lorem IPS" UTF8_REPLACEMENT UTF8_REPLACEMENT "M";
    static const char WANT_UTF8_KEPT[] = "¡Hola, señor! Lorem IPSÜM";
    static const char WANT_LATIN1[] = "Café Lorem IPS" UTF8_REPLACEMENT "M";

    /* inputs that must parse to a non-NULL result, in run order */
    const struct {
        int flags;
        const char *header;
        const char *want;
    } cases[] = {
        /* default flags */
        { 0, "",        "" },
        { 0, IN_ASCII,  IN_ASCII },
        { 0, IN_UTF8,   WANT_UTF8_REPLACED },
        { 0, IN_LATIN1, WANT_LATIN1 },
        /* with CHARSET_MIME_UTF8 */
        { CHARSET_MIME_UTF8, IN_ASCII,  IN_ASCII },
        { CHARSET_MIME_UTF8, IN_UTF8,   WANT_UTF8_KEPT },
        { CHARSET_MIME_UTF8, IN_LATIN1, WANT_LATIN1 },
    };

    char *parsed;
    size_t i;

    /* a NULL header yields a NULL result */
    parsed = charset_parse_mimeheader(NULL, 0);
    CU_ASSERT_PTR_NULL(parsed);
    free(parsed);

    for (i = 0; i < sizeof(cases) / sizeof(cases[0]); i++) {
        parsed = charset_parse_mimeheader(cases[i].header, cases[i].flags);
        CU_ASSERT_PTR_NOT_NULL(parsed);
        CU_ASSERT_STRING_EQUAL(parsed, cases[i].want);
        free(parsed);
    }
}

/*
 * An encoded-word naming an unknown charset ("foo") is expected to decode
 * to a single Unicode Replacement character standing in for the whole word.
 */
static void test_mimeheader_badcharset(void)
{
    static const char HEADER[] = "A =?foo?B?wMzIo8Dn?= B";
    static const char WANT[] = "A " UTF8_REPLACEMENT "B";

    /* the default search flags */
    char *decoded = charset_decode_mimeheader(HEADER,
                        CHARSET_SKIPDIACRIT | CHARSET_MERGESPACE);

    CU_ASSERT_PTR_NOT_NULL(decoded);
    CU_ASSERT_STRING_EQUAL(decoded, WANT);
    free(decoded);
}

/*
 * Exercise charset_unfold() on each input twice: once with
 * CHARSET_UNFOLD_SKIPWS and once with no flags.
 */
static void test_unfold(void)
{
    static const struct {
        const char *folded;     /* input text */
        const char *skipped;    /* expected with CHARSET_UNFOLD_SKIPWS */
        const char *kept;       /* expected with flags == 0 */
    } cases[] = {
        /* Single line */
        { "abcdef", "abcdef", "abcdef" },

        /* Single line, ending in CRLF */
        { "abcdef\r\n", "abcdef", "abcdef" },

        /* Two lines */
        { "abc\r\ndef", "abc\r\ndef", "abc\r\ndef" },

        /* Two lines, first with continuation line */
        { "ab\r\n c\r\ndef", "abc\r\ndef", "ab c\r\ndef" },

        /* Two lines, both with continuation lines */
        { "a\r\n\t\r\n b\r\n c\r\nd\r\n ef", "abc\r\ndef", "a\t b c\r\nd ef" },

        /* One long, empty continuation line */
        /* Typically, RFCs using unfolding forbid this case. */
        { "\r\n\t\r\n \r\n \r\n", "", "\t  " },
    };
    size_t i;

    for (i = 0; i < sizeof(cases) / sizeof(cases[0]); i++) {
        const char *folded = cases[i].folded;
        char *without_ws;
        char *with_ws;

        without_ws = charset_unfold(folded, strlen(folded), CHARSET_UNFOLD_SKIPWS);
        CU_ASSERT_PTR_NOT_NULL(without_ws);
        CU_ASSERT_STRING_EQUAL(without_ws, cases[i].skipped);

        with_ws = charset_unfold(folded, strlen(folded), 0);
        CU_ASSERT_PTR_NOT_NULL(with_ws);
        CU_ASSERT_STRING_EQUAL(with_ws, cases[i].kept);

        free(with_ws);
        free(without_ws);
    }
}

/*
 * Decode one folded header block with charset_decode_mimeheader() under
 * each of the three whitespace treatments and compare the results.
 */
static void test_mime_unfold(void)
{
    /* the same folded input is used for every mode */
    static const char HEADERS[] =
        "From: foo@bar\r\n"
        "To: baz@quux\r\n"
        "Subject: this\r\n"
        "\tline is continued\r\n"
        "Keywords: and\r\n"
        "\tso is\r\n"
        " this one\r\n"
        "\r\n";

    const struct {
        int flags;
        const char *want;
    } modes[] = {
        /* Test unfolding and the 'keep' space option.  Note that 'keep' is
         * a bit of a misnomer, it actually converts whitespace characters
         * to SP before keeping the same *number* of chars, which is
         * actually quite unhelpful.
         */
        { CHARSET_SKIPDIACRIT,
          "FROM: FOO@BAR  "
          "TO: BAZ@QUUX  "
          "SUBJECT: THIS LINE IS CONTINUED  "
          "KEYWORDS: AND SO IS THIS ONE  "
          "  " },

        /* test unfolding and the 'merge' space option which merges any
         * amount of whitespace down to a single SP character */
        { CHARSET_SKIPDIACRIT|CHARSET_MERGESPACE,
          "FROM: FOO@BAR "
          "TO: BAZ@QUUX "
          "SUBJECT: THIS LINE IS CONTINUED "
          "KEYWORDS: AND SO IS THIS ONE " },

        /* test unfolding and the 'skip' space option which elides
         * all whitespace. */
        { CHARSET_SKIPDIACRIT|CHARSET_SKIPSPACE,
          "FROM:FOO@BAR"
          "TO:BAZ@QUUX"
          "SUBJECT:THISLINEISCONTINUED"
          "KEYWORDS:ANDSOISTHISONE" },
    };
    size_t i;

    for (i = 0; i < sizeof(modes) / sizeof(modes[0]); i++) {
        char *decoded = charset_decode_mimeheader(HEADERS, modes[i].flags);
        CU_ASSERT_STRING_EQUAL(decoded, modes[i].want);
        free(decoded);
    }
}

/*
 * Search a subject header made of two Cp1252 Q-encoded words for a plain
 * word, using a pattern compiled from the search-normalised form of that
 * word.
 */
static void test_search_mimeheader(void)
{
    static const char SUBJECT_CP1252[] = "=?Cp1252?Q?Herzlichen_Gl=FCckwunsch,_der_Artikel_Canon_Ob?= "
                                         "=?Cp1252?Q?jektiv_EF-S_18-55_mm_1:3,5-5,6_geh=F6rt_Ihnen!?=";
    static const char SEARCH_CP1252[] = "Herzlichen";
    int flags = CHARSET_SKIPDIACRIT | CHARSET_MERGESPACE; /* default */
    charset_t cs;
    comp_pat *pat;
    char *s;

    /* check each intermediate result, as the sibling tests do, so that a
     * failed setup step is reported instead of being passed along */
    cs = charset_lookupname("us-ascii");
    CU_ASSERT_PTR_NOT_NULL(cs);

    /* the search term is converted with the same flags as the search */
    s = charset_convert(SEARCH_CP1252, cs, flags);
    CU_ASSERT_PTR_NOT_NULL(s);

    pat = charset_compilepat(s);
    CU_ASSERT_PTR_NOT_NULL(pat);

    CU_ASSERT(charset_search_mimeheader(s, pat, SUBJECT_CP1252, flags));

    charset_freepat(pat);
    charset_free(&cs);
    free(s);
}

/*
 * Check charset_convert() with flags == 0 against the worked example that
 * the comment below describes.
 */
static void test_rfc5051(void)
{
    /* Example: codepoint U+01C4 (LATIN CAPITAL LETTER DZ WITH CARON)
     * has a titlecase property of U+01C5 (LATIN CAPITAL LETTER D
     * WITH SMALL LETTER Z WITH CARON).  Codepoint U+01C5 has a
     * decomposition property of U+0044 (LATIN CAPITAL LETTER D)
     * U+017E (LATIN SMALL LETTER Z WITH CARON).  U+017E has a
     * decomposition property of U+007A (LATIN SMALL LETTER Z) U+030c
     */
    char *s;
    /* Written as string literals rather than brace lists of integer
     * constants: values such as 0xc7 do not fit a signed char, so the
     * brace form relied on an implementation-defined conversion.  The
     * bytes are unchanged: C7 84, and 'D' 'z' CC 8C. */
    static const char STR_RFC5051[] = "\xC7\x84";
    static const char RES_RFC5051[] = "Dz\xCC\x8C";
    int flags = 0; /* super compliant */
    charset_t cs;

    cs = charset_lookupname("utf-8");
    CU_ASSERT_PTR_NOT_NULL(cs);

    s = charset_convert(STR_RFC5051, cs, flags);
    CU_ASSERT_PTR_NOT_NULL(s);
    CU_ASSERT_STRING_EQUAL(s, RES_RFC5051);
    charset_free(&cs);
    free(s);
}

/* State passed to append_text() through its `rock` argument. */
struct text_rock {
    int ncalls;         /* incremented once per append_text() call */
    struct buf out;     /* every text chunk received, appended in order */
};

/*
 * Callback handed to charset_extract() by the file-level TESTCASE macro.
 * `rock` points at a struct text_rock: the call is counted and `text` is
 * appended to the accumulated output.
 */
static void append_text(const struct buf *text, void *rock)
{
    struct text_rock *state = rock;

    state->ncalls += 1;
    buf_append(&state->out, text);
}

/*
 * Run charset_extract() over the literal `in` (length taken from the
 * literal, terminator excluded) with the charset named `cs`, encoding `enc`
 * and the string `st` ("PLAIN" or "HTML" at the visible call sites),
 * collecting output through append_text().  Asserts that the call returns
 * 1, that the callback ran exactly once, and that the collected text
 * equals `exp`.
 *
 * The expansion reads an `int flags` variable that must be in scope at the
 * point of use.
 *
 * NOTE: this expands to a bare brace block, not do { } while (0); at least
 * one invocation in test_extract() has no trailing semicolon and depends
 * on that.
 */
#define TESTCASE(in, cs, enc, st, exp) \
    { \
        static const char _in[] = (in); \
        charset_t _cs = charset_lookupname(cs); \
        CU_ASSERT_PTR_NOT_NULL(_cs); \
        int _enc = (enc); \
        static const char _st[] = (st); \
        static const char _exp[] = (exp); \
        struct buf bin = BUF_INITIALIZER; \
        struct text_rock tr; \
        int r; \
 \
        memset(&tr, 0, sizeof(tr)); \
        buf_init_ro(&bin, _in, sizeof(_in)-1); \
 \
        r = charset_extract(append_text, &tr, &bin, _cs, _enc, _st, flags); \
        CU_ASSERT_EQUAL(r, 1); \
        CU_ASSERT_EQUAL(tr.ncalls, 1); \
        CU_ASSERT_STRING_EQUAL(buf_cstring(&tr.out), _exp); \
 \
        buf_free(&bin); \
        buf_free(&tr.out); \
        charset_free(&_cs); \
    }

static void test_extract(void)
{
    int flags = CHARSET_SKIPDIACRIT | CHARSET_MERGESPACE; /* default */
    /* data thanks to hipsteripsum.me */

    /* simplest case - no space, plain text is capitalised */
    TESTCASE("freegan", "us-ascii", ENCODING_NONE, "PLAIN", "FREEGAN");

    /* capitalised text is still capitalised */
    TESTCASE("FANNY PACK", "us-ascii", ENCODING_NONE, "PLAIN", "FANNY PACK");

    /* single spaces become single spaces */
    TESTCASE("before they sold out",
             "us-ascii", ENCODING_NONE, "PLAIN",
             "BEFORE THEY SOLD OUT");

    /* runs of whitespace are squashed to a single space */
    TESTCASE("you    probably \t haven't\r\nheard\t\r\tof them",
             "us-ascii", ENCODING_NONE, "PLAIN",
             "YOU PROBABLY HAVEN'T HEARD OF THEM");

    /* invalid UTF-8 bytes become the Replacement character */
    TESTCASE("a\300b", "us-ascii", ENCODING_NONE, "PLAIN", /* 0xC0 */
             "A"UTF8_REPLACEMENT"B");
    TESTCASE("a\301b", "us-ascii", ENCODING_NONE, "PLAIN", /* 0xC1 */
             "A"UTF8_REPLACEMENT"B");
    TESTCASE("a\365b", "us-ascii", ENCODING_NONE, "PLAIN", /* 0xF5 */
             "A"UTF8_REPLACEMENT"B");
    TESTCASE("a\366b", "us-ascii", ENCODING_NONE, "PLAIN", /* 0xF6 */
             "A"UTF8_REPLACEMENT"B");
    TESTCASE("a\367b", "us-ascii", ENCODING_NONE, "PLAIN", /* 0xF7 */
             "A"UTF8_REPLACEMENT"B");
    TESTCASE("a\370b", "us-ascii", ENCODING_NONE, "PLAIN", /* 0xF8 */
             "A"UTF8_REPLACEMENT"B");
    TESTCASE("a\371b", "us-ascii", ENCODING_NONE, "PLAIN", /* 0xF9 */
             "A"UTF8_REPLACEMENT"B");
    TESTCASE("a\372b", "us-ascii", ENCODING_NONE, "PLAIN", /* 0xFA */
             "A"UTF8_REPLACEMENT"B");
    TESTCASE("a\373b", "us-ascii", ENCODING_NONE, "PLAIN", /* 0xFB */
             "A"UTF8_REPLACEMENT"B");
    TESTCASE("a\374b", "us-ascii", ENCODING_NONE, "PLAIN", /* 0xFC */
             "A"UTF8_REPLACEMENT"B");
    TESTCASE("a\375b", "us-ascii", ENCODING_NONE, "PLAIN", /* 0xFD */
             "A"UTF8_REPLACEMENT"B");
    TESTCASE("a\376b", "us-ascii", ENCODING_NONE, "PLAIN", /* 0xFE */
             "A"UTF8_REPLACEMENT"B");
    TESTCASE("a\377b", "us-ascii", ENCODING_NONE, "PLAIN", /* 0xFF */
             "A"UTF8_REPLACEMENT"B");

    /* ill-formed UTF-8 sequences become the Replacement character */

    /* 2-byte sequence lead byte then a non-continuation byte */
    TESTCASE("a\302bcd", "us-ascii", ENCODING_NONE, "PLAIN",
             "A"UTF8_REPLACEMENT"BCD");
    /* 3-byte sequence lead byte then a non-continuation byte */
    TESTCASE("a\340bcde", "us-ascii", ENCODING_NONE, "PLAIN",
             "A"UTF8_REPLACEMENT"BCDE");
    /* 4-byte sequence lead byte then a non-continuation byte */
    TESTCASE("a\360bcdef", "us-ascii", ENCODING_NONE, "PLAIN",
             "A"UTF8_REPLACEMENT"BCDEF");
    /* unexpected continuation byte */
    TESTCASE("a\240bc", "us-ascii", ENCODING_NONE, "PLAIN",
             "A"UTF8_REPLACEMENT"BC");

    /* HTML: correctly formed balanced tag pairs */
    TESTCASE("<b>Photo</b> <em>booth</em>",
        "us-ascii", ENCODING_NONE, "HTML",
        "PHOTO BOOTH");

    /* HTML: unbalanced tags */
    TESTCASE("<b>American<b> <b>Apparel</b>",
        "us-ascii", ENCODING_NONE, "HTML",
        "AMERICAN APPAREL");

    /* HTML: OMITTAG tags with and without end tags */
    TESTCASE("<hr>Terry<hr> <hr>Richardson</hr>",
        "us-ascii", ENCODING_NONE, "HTML",
        " TERRY RICHARDSON ");

    /* HTML: non-phrasing tags are replaced with whitespace */
    TESTCASE("hella<br>mlkshk",
        "us-ascii", ENCODING_NONE, "HTML",
        "HELLA MLKSHK");
    TESTCASE("godard<br/>synth",
        "us-ascii", ENCODING_NONE, "HTML",
        "GODARD SYNTH");
    TESTCASE("<div>vinyl</div><div>narwhal</div>",
        "us-ascii", ENCODING_NONE, "HTML",
        " VINYL NARWHAL ");

    /* HTML: quoted tag parameters */
    TESTCASE("<a href=\"foo.html\">leggings</a> <img src\"beer.jpg\">gastropub",
        "us-ascii", ENCODING_NONE, "HTML",
        "LEGGINGS GASTROPUB");

    /* HTML: unquoted tag parameters */
    TESTCASE("<a href=foo.html>biodiesel</a> <img srcbeer.jpg>seitan",
        "us-ascii", ENCODING_NONE, "HTML",
        "BIODIESEL SEITAN");

    /* HTML: contents of SCRIPT tag */
    TESTCASE("viral <script>bicycle rights</script>readymade",
        "us-ascii", ENCODING_NONE, "HTML",
        "VIRAL READYMADE");

    /* HTML: HTML4 SCRIPT tag with no contents */
    TESTCASE("cardigan <script type=\"text/javascript\" "
             "src=\"truffaut.js\"></script>williamsburg",
        "us-ascii", ENCODING_NONE, "HTML",
        "CARDIGAN WILLIAMSBURG");

    /* HTML: XHTML SCRIPT empty-element-tag aka self-closing tag */
    TESTCASE("brunch <script type=\"text/javascript\" "
             "src=\"cred.js\"/>shoreditch",
        "us-ascii", ENCODING_NONE, "HTML",
        "BRUNCH SHOREDITCH");

    /* HTML: contents of STYLE tag */
    TESTCASE("pickled <style>whatever tumblr</style>stumptown",
        "us-ascii", ENCODING_NONE, "HTML",
        "PICKLED STUMPTOWN");

    /* HTML: comments, correctly formed */
    TESTCASE("pinterest <!-- master cleanse -->forage",
        "us-ascii", ENCODING_NONE, "HTML",
        "PINTEREST FORAGE");

    /* HTML: comments correctly formed with embedded -- */
    TESTCASE("polaroid <!-- food -- truck -->letterpress",
        "us-ascii", ENCODING_NONE, "HTML",
        "POLAROID LETTERPRESS");

    /* HTML: comments correctly formed with embedded tags */
    TESTCASE("semiotics <!-- messenger <hr> bag -->scenester",
        "us-ascii", ENCODING_NONE, "HTML",
        "SEMIOTICS SCENESTER");

    /* HTML: comments correctly formed with embedded -> */
    TESTCASE("butcher <!-- cosby -> sweater -->whatever",
        "us-ascii", ENCODING_NONE, "HTML",
        "BUTCHER WHATEVER");

    /* HTML: comments correctly formed with ---> ending */
    TESTCASE("ennui <!-- art party --->keffiyeh",
        "us-ascii", ENCODING_NONE, "HTML",
        "ENNUI KEFFIYEH");

    /* HTML: trivial comment */
    TESTCASE("street <!-->art",
        "us-ascii", ENCODING_NONE, "HTML",
        "STREET ART");

    /* HTML: initial DOCTYPE is ignored */
    TESTCASE("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01//EN\" "
             "\"http://www.w3.org/TR/html4/strict.dtd\">ethnic sustainable",
        "us-ascii", ENCODING_NONE, "HTML",
        "ETHNIC SUSTAINABLE");

    /* HTML: simple character references */
    TESTCASE("&quot;Twee &amp; Keytar&quot; &lt;dreamcatcher@umami.org&gt;",
        "us-ascii", ENCODING_NONE, "HTML",
        "\"TWEE & KEYTAR\" <DREAMCATCHER@UMAMI.ORG>");

    /* HTML: naked & is emitted */
    TESTCASE("gentrify&<b>sartorial</b>",
        "us-ascii", ENCODING_NONE, "HTML",
        "GENTRIFY&SARTORIAL");

    /* HTML: non-zero length unterminated entities are emitted */
    TESTCASE("tattooed&amp locavore",
        "us-ascii", ENCODING_NONE, "HTML",
        "TATTOOED& LOCAVORE");

    /* HTML: decimal Unicode entities: U+267B RECYCLE SYMBOL */
    TESTCASE("odd&#9851;future",
        "us-ascii", ENCODING_NONE, "HTML",
        "ODD♻FUTURE");

    /* HTML: hexadecimal Unicode entities: U+2704 SCISSORS */
    TESTCASE("odd&#x2704;future",
        "us-ascii", ENCODING_NONE, "HTML",
        "ODD✄FUTURE");

    /* HTML: compatibility numerical character references */
    TESTCASE(
        "A&#128;B&#129;C&#130;D&#131;"
        "E&#132;F&#133;G&#134;H&#135;"
        "I&#136;J&#137;K&#138;L&#139;"
        "M&#140;N&#141;O&#142;P&#143;"
        "Q&#144;R&#145;S&#146;T&#147;"
        "U&#148;V&#149;W&#150;X&#151;"
        "Y&#152;Z&#153;A&#154;B&#155;"
        "C&#156;D&#157;E&#158;F&#159;g",
        "us-ascii", ENCODING_NONE, "HTML",
        "A€BC‚DƑ"  /* ƒ capitalised */
        "E„F...G†H‡"      /* … normalised to ... */
        "IˆJ‰KSL‹" /* Š normalised to S */
        "MŒNOZP"       /* Ž normalised to Z */
        "QR‘S’T“"
        "U”V•W–X—"
        "Y˜ZTMASB›"  /* š capitalised then normalised to S,
                         * ™ normalised to TM */
        "CŒDEZFYG")    /* œ capitalised to Œ,
                           ž capitalised then normalised to Z,
                         * Ÿ normalised to Y */

    /* HTML: numerical character references to invalid Unicode
     * codepoints and valid codepoints just adjacent to invalid
     * ranges.  HTML5 requires us to emit a Replacement char. */
    TESTCASE("A&#xd7ff;B", "us-ascii", ENCODING_NONE, "HTML", "A\355\237\277B");
    TESTCASE("A&#xd800;B", "us-ascii", ENCODING_NONE, "HTML", "A"UTF8_REPLACEMENT"B");
    TESTCASE("A&#xd801;B", "us-ascii", ENCODING_NONE, "HTML", "A"UTF8_REPLACEMENT"B");
    TESTCASE("A&#xdffe;B", "us-ascii", ENCODING_NONE, "HTML", "A"UTF8_REPLACEMENT"B");
    TESTCASE("A&#xdfff;B", "us-ascii", ENCODING_NONE, "HTML", "A"UTF8_REPLACEMENT"B");
    TESTCASE("A&#xc000;B", "us-ascii", ENCODING_NONE, "HTML", "A\354\200\200B");
    TESTCASE("A&#x10fffd;B", "us-ascii", ENCODING_NONE, "HTML", "A\364\217\277\275B");
    TESTCASE("A&#x110000;B", "us-ascii", ENCODING_NONE, "HTML", "A"UTF8_REPLACEMENT"B");
    TESTCASE("A&#x7fffffff;B", "us-ascii", ENCODING_NONE, "HTML", "A"UTF8_REPLACEMENT"B");
    TESTCASE("A&#xffffffff;B", "us-ascii", ENCODING_NONE, "HTML", "A"UTF8_REPLACEMENT"B");

    /* HTML: zero numerical character reference.  The HTML5 spec says
     * to return a Replacement char. */
    TESTCASE("A&#0;B", "us-ascii", ENCODING_NONE, "HTML", "A"UTF8_REPLACEMENT"B");

    /* HTML: numerical character references whose codepoints the HTML5
     * spec says are a parse error.  We just silently swallow these. */
    /* U+0001..U+0008 */
    TESTCASE("A&#1;B", "us-ascii", ENCODING_NONE, "HTML", "AB");
    TESTCASE("A&#8;B", "us-ascii", ENCODING_NONE, "HTML", "AB");
    /* U+000B */
    TESTCASE("A&#xb;B", "us-ascii", ENCODING_NONE, "HTML", "AB");
    /* U+000E..U+001F */
    TESTCASE("A&#xe;B", "us-ascii", ENCODING_NONE, "HTML", "AB");
    TESTCASE("A&#x1f;B", "us-ascii", ENCODING_NONE, "HTML", "AB");
    /* U+007F..U+009f, when not a compatibility codepoint */
    TESTCASE("A&#x7f;B", "us-ascii", ENCODING_NONE, "HTML", "AB");
    TESTCASE("A&#x81;B", "us-ascii", ENCODING_NONE, "HTML", "AB");
    TESTCASE("A&#x8D;B", "us-ascii", ENCODING_NONE, "HTML", "AB");
    TESTCASE("A&#x8F;B", "us-ascii", ENCODING_NONE, "HTML", "AB");
    TESTCASE("A&#x90;B", "us-ascii", ENCODING_NONE, "HTML", "AB");
    TESTCASE("A&#x9D;B", "us-ascii", ENCODING_NONE, "HTML", "AB");
    /* U+FDD0..U+FDEF */
    TESTCASE("A&#xFDD0;B", "us-ascii", ENCODING_NONE, "HTML", "AB");
    TESTCASE("A&#xFDEF;B", "us-ascii", ENCODING_NONE, "HTML", "AB");
    /* the last two codepoints in each plane */
    TESTCASE("A&#xFFFE;B", "us-ascii", ENCODING_NONE, "HTML", "AB");
    TESTCASE("A&#xFFFF;B", "us-ascii", ENCODING_NONE, "HTML", "AB");
    TESTCASE("A&#x1FFFE;B", "us-ascii", ENCODING_NONE, "HTML", "AB");
    TESTCASE("A&#x1FFFF;B", "us-ascii", ENCODING_NONE, "HTML", "AB");
    TESTCASE("A&#x2FFFE;B", "us-ascii", ENCODING_NONE, "HTML", "AB");
    TESTCASE("A&#x2FFFF;B", "us-ascii", ENCODING_NONE, "HTML", "AB");
    TESTCASE("A&#x3FFFE;B", "us-ascii", ENCODING_NONE, "HTML", "AB");
    TESTCASE("A&#x3FFFF;B", "us-ascii", ENCODING_NONE, "HTML", "AB");
    TESTCASE("A&#x4FFFE;B", "us-ascii", ENCODING_NONE, "HTML", "AB");
    TESTCASE("A&#x4FFFF;B", "us-ascii", ENCODING_NONE, "HTML", "AB");
    TESTCASE("A&#x5FFFE;B", "us-ascii", ENCODING_NONE, "HTML", "AB");
    TESTCASE("A&#x5FFFF;B", "us-ascii", ENCODING_NONE, "HTML", "AB");
    TESTCASE("A&#x6FFFE;B", "us-ascii", ENCODING_NONE, "HTML", "AB");
    TESTCASE("A&#x6FFFF;B", "us-ascii", ENCODING_NONE, "HTML", "AB");
    TESTCASE("A&#x7FFFE;B", "us-ascii", ENCODING_NONE, "HTML", "AB");
    TESTCASE("A&#x7FFFF;B", "us-ascii", ENCODING_NONE, "HTML", "AB");
    TESTCASE("A&#x8FFFE;B", "us-ascii", ENCODING_NONE, "HTML", "AB");
    TESTCASE("A&#x8FFFF;B", "us-ascii", ENCODING_NONE, "HTML", "AB");
    TESTCASE("A&#x9FFFE;B", "us-ascii", ENCODING_NONE, "HTML", "AB");
    TESTCASE("A&#x9FFFF;B", "us-ascii", ENCODING_NONE, "HTML", "AB");
    TESTCASE("A&#xAFFFE;B", "us-ascii", ENCODING_NONE, "HTML", "AB");
    TESTCASE("A&#xAFFFF;B", "us-ascii", ENCODING_NONE, "HTML", "AB");
    TESTCASE("A&#xBFFFE;B", "us-ascii", ENCODING_NONE, "HTML", "AB");
    TESTCASE("A&#xBFFFF;B", "us-ascii", ENCODING_NONE, "HTML", "AB");
    TESTCASE("A&#xCFFFE;B", "us-ascii", ENCODING_NONE, "HTML", "AB");
    TESTCASE("A&#xCFFFF;B", "us-ascii", ENCODING_NONE, "HTML", "AB");
    TESTCASE("A&#xDFFFE;B", "us-ascii", ENCODING_NONE, "HTML", "AB");
    TESTCASE("A&#xDFFFF;B", "us-ascii", ENCODING_NONE, "HTML", "AB");
    TESTCASE("A&#xEFFFE;B", "us-ascii", ENCODING_NONE, "HTML", "AB");
    TESTCASE("A&#xEFFFF;B", "us-ascii", ENCODING_NONE, "HTML", "AB");
    TESTCASE("A&#xFFFFE;B", "us-ascii", ENCODING_NONE, "HTML", "AB");
    TESTCASE("A&#xFFFFF;B", "us-ascii", ENCODING_NONE, "HTML", "AB");
    TESTCASE("A&#x10FFFE;B", "us-ascii", ENCODING_NONE, "HTML", "AB");
    TESTCASE("A&#x10FFFF;B", "us-ascii", ENCODING_NONE, "HTML", "AB");

    /* HTML: some of the more obscure named character references.  The
     * tricky part is testing the case sensitivity and unusual character
     * generation of the HTML character reference matching code, while
     * the search normalisation code gets in the way. */

    /* &alpha; and &Alpha; are both defined but both get normalised
     * to GREEK CAPITAL LETTER ALPHA */
    TESTCASE("A&alpha;B", "us-ascii", ENCODING_NONE, "HTML", "AΑB");
    TESTCASE("A&Alpha;B", "us-ascii", ENCODING_NONE, "HTML", "AΑB");
    /* &clubs; is defined, &Clubs; is not */
    TESTCASE("A&clubs;B", "us-ascii", ENCODING_NONE, "HTML", "A♣B");
    TESTCASE("A&Clubs;B", "us-ascii", ENCODING_NONE, "HTML", "A"UTF8_REPLACEMENT"B");
    /* &fjlig; is defined to emit a 2-codepoint sequence */
    TESTCASE("A&fjlig;B", "us-ascii", ENCODING_NONE, "HTML", "AFJB");
    /* &ycirc; emits a codepoint which is then normalised and capitalised */
    TESTCASE("A&ycirc;B", "us-ascii", ENCODING_NONE, "HTML", "AYB");
    /* &uparrow; and &UpArrow; are both defined to the same codepoint,
     * which survives normalisation intact, but neither &UParrow; nor
     * &upARROW; are defined.  &Uparrow; is defined to a *different*
     * codepoint which also survives normalisation. */
    TESTCASE("A&uparrow;B", "us-ascii", ENCODING_NONE, "HTML", "A↑B");
    TESTCASE("A&Uparrow;B", "us-ascii", ENCODING_NONE, "HTML", "A⇑B");
    TESTCASE("A&UpArrow;B", "us-ascii", ENCODING_NONE, "HTML", "A↑B");
    TESTCASE("A&UParrow;B", "us-ascii", ENCODING_NONE, "HTML", "A"UTF8_REPLACEMENT"B");
    TESTCASE("A&upARROW;B", "us-ascii", ENCODING_NONE, "HTML", "A"UTF8_REPLACEMENT"B");

    /* &nonesuch; is most definitely not defined */
    TESTCASE("A&nonesuch;B", "us-ascii", ENCODING_NONE, "HTML", "A"UTF8_REPLACEMENT"B");

    /* HTML: Strip HTML from snippet and don't canonify */
    flags = CHARSET_SNIPPET;
    TESTCASE("<b>Photo</b> <em>booth</em>", "us-ascii", ENCODING_NONE, "HTML", "Photo booth");

    /* HTML: Strip HTML from snippet, there is nothing to escape */
    flags = CHARSET_SNIPPET|CHARSET_ESCAPEHTML;
    TESTCASE("<b>Photo</b>", "us-ascii", ENCODING_NONE, "HTML", "Photo");

    /* PLAIN: Generate snippet, don't escape HTML by default */
    flags = CHARSET_SNIPPET;
    TESTCASE("<b>Photo</b>", "us-ascii", ENCODING_NONE, "PLAIN", "<b>Photo</b>");

    /* PLAIN: Generate snippet with escaped HTML */
    flags = CHARSET_SNIPPET|CHARSET_ESCAPEHTML;
    TESTCASE("<b>Photo</b>", "us-ascii", ENCODING_NONE, "PLAIN", "&lt;b&gt;Photo&lt;/b&gt;");
}
#undef TESTCASE

/*
 * Check that charset_utf8_to_searchform() reduces every spelling of
 * "i with diaeresis" -- small or capital, precomposed or built from a
 * base letter plus a combining mark -- to the same search form, a
 * single capital "I".
 */
static void test_utf8_to_searchform(void)
{
    /* Each entry pairs a UTF-8 input with its expected search form. */
    static const struct {
        const char *utf8;
        const char *searchform;
    } cases[] = {
        /* LATIN SMALL LETTER I WITH DIAERESIS (U+00EF) */
        { "\xC3\xAF", "I" },
        /* LATIN CAPITAL LETTER I WITH DIAERESIS (U+00CF) */
        { "\xC3\x8F", "I" },
        /* LATIN CAPITAL LETTER I (U+0049) COMBINING DIAERESIS (U+0308) */
        { "\x49\xCC\x88", "I" },
        /* LATIN SMALL LETTER I (U+0069) COMBINING DIAERESIS (U+0308) */
        { "\x69\xCC\x88", "I" },
    };
    const int flags = CHARSET_SKIPDIACRIT | CHARSET_MERGESPACE; /* default */
    size_t i;

    for (i = 0; i < sizeof(cases) / sizeof(cases[0]); i++) {
        /* the result is released with free() once it has been checked */
        char *got = charset_utf8_to_searchform(cases[i].utf8, flags);

        CU_ASSERT_PTR_NOT_NULL(got);
        CU_ASSERT_STRING_EQUAL(got, cases[i].searchform);
        free(got);
    }
}

/*
 * Exercise charset_decode() with empty and non-empty input, both
 * unencoded (ENCODING_NONE) and base64-encoded (ENCODING_BASE64), and
 * compare the decoded bytes against the expected output.
 */
static void test_charset_decode(void)
{

/*
 * Run one charset_decode() case.
 *
 *   in, inlen     - input string literal and the number of its bytes
 *                   handed to charset_decode()
 *   want, wantlen - expected output string literal and its length
 *   enc           - ENCODING_* value passed to charset_decode()
 *
 * `in` and `want` must be string literals: they initialise arrays
 * directly (a parenthesised literal is not a valid array initialiser
 * in standard C, so the arguments are deliberately not wrapped).
 *
 * The byte comparison only runs when the decoded length matches and is
 * non-zero, so memcmp() never reads past the decoded data and is never
 * handed a data pointer that was left unset for an empty result.
 *
 * The decode buffer is released at the end of every case.
 * NOTE(review): buf_free() is assumed to be declared by the same
 * header that provides struct buf and BUF_INITIALIZER -- confirm.
 */
#define TESTCASE(in, inlen, want, wantlen, enc) \
    do { \
        struct buf _dst = BUF_INITIALIZER; \
        static const char _in[] = in; \
        static const char _want[] = want; \
        int _enc = (enc); \
        size_t _inlen = (inlen); \
        size_t _wantlen = (wantlen); \
        int _r; \
        _r = charset_decode(&_dst, _in, _inlen, _enc); \
        CU_ASSERT(_r == 0); \
        CU_ASSERT(_dst.len == _wantlen); \
        if (0) { /* change to 1 to dump the decoded bytes */ \
            size_t _i; \
            for (_i = 0; _i < _dst.len; _i++) \
                fprintf(stderr, "%x\n", (unsigned char)_dst.s[_i]); \
        } \
        CU_ASSERT(_dst.len == _wantlen && \
                  (_wantlen == 0 || \
                   memcmp(_dst.s, _want, _wantlen) == 0)); \
        buf_free(&_dst); \
    } while (0)

    /* empty input decodes to empty output under either encoding */
    TESTCASE("", 0, "", 0, ENCODING_NONE);
    TESTCASE("", 0, "", 0, ENCODING_BASE64);
    /* unencoded input is passed through unchanged */
    TESTCASE("Hello", 5, "Hello", 5, ENCODING_NONE);
    /* eight base64 characters decode to six raw bytes */
    TESTCASE("beefc0de", 8, "\x6d\xe7\x9f\x73\x47\x5e", 6, ENCODING_BASE64);

#undef TESTCASE
}

/* vim: set ft=c: */