#include #include "cunit/cunit.h" #include "charset.h" extern int charset_debug; /* The Unicode Replacement character 0xfffd in UTF-8 encoding */ #define UTF8_REPLACEMENT "\357\277\275" /* The Replacement char after search normalisation */ #define SEARCH_REPLACEMENT "\377" static void test_lookupname(void) { charset_t cs, cs2; /* us-ascii must exist */ cs = charset_lookupname("us-ascii"); CU_ASSERT_PTR_NOT_NULL(cs); /* names are case-insensitive */ cs2 = charset_lookupname("US-ASCII"); CU_ASSERT_PTR_NOT_NULL(cs2); CU_ASSERT_STRING_EQUAL(charset_name(cs), charset_name(cs2)); charset_free(&cs2); cs2 = charset_lookupname("Us-AsCiI"); CU_ASSERT_PTR_NOT_NULL(cs2); CU_ASSERT_STRING_EQUAL(charset_name(cs), charset_name(cs2)); charset_free(&cs2); charset_free(&cs); /* some others must also exist */ cs = charset_lookupname("utf-8"); CU_ASSERT_PTR_NOT_NULL(cs); charset_free(&cs); cs = charset_lookupname("utf-7"); CU_ASSERT_PTR_NOT_NULL(cs); charset_free(&cs); cs = charset_lookupname("iso-8859-1"); CU_ASSERT_PTR_NOT_NULL(cs); charset_free(&cs); /* * Assert that ICU-backed charsets return the Cyrus-canonical name. E.g: * ICU names windows-1252 as ibm-5348_P100-1997 but we don't want that. */ cs = charset_lookupname("windows-1252"); CU_ASSERT_PTR_NOT_NULL(cs); CU_ASSERT_STRING_EQUAL(charset_name(cs), "windows-1252"); charset_free(&cs); cs = charset_lookupname("1252"); CU_ASSERT_PTR_NOT_NULL(cs); CU_ASSERT_STRING_EQUAL(charset_name(cs), "windows-1252"); charset_free(&cs); /* But still use the ICU name if there's no good alias for it */ cs = charset_lookupname("ebcdic-ar"); CU_ASSERT_PTR_NOT_NULL(cs); CU_ASSERT_STRING_EQUAL(charset_name(cs), "ibm-16804_X110-1999"); charset_free(&cs); } static void test_to_utf8(void) { charset_t cs; char *s; static const char ASCII_1[] = "Hello World"; static const char ASCII_2[] = "Hello W\370rld"; static const char UTF8_2[] = "Hello W" UTF8_REPLACEMENT "rld"; static const char BASE64_3[] = "SGVsbG8gV29ybGQ="; static const char QP_4[] = "If you believe that truth=3Dbeauty, then surely=20=\r\n" "mathematics is the most beautiful branch of philosophy.\r\n"; static const char ASCII_4[] = "If you believe that truth=beauty, then surely " "mathematics is the most beautiful branch of philosophy.\r\n"; cs = charset_lookupname("us-ascii"); CU_ASSERT_PTR_NOT_NULL(cs); /* zero length input */ s = charset_to_utf8("", 0, cs, ENCODING_NONE); CU_ASSERT_PTR_NOT_NULL(s); CU_ASSERT_STRING_EQUAL(s, ""); free(s); /* invalid encoding */ s = charset_to_utf8(ASCII_1, sizeof(ASCII_1), cs, 0xdeadbeef); CU_ASSERT_PTR_NULL(s); /* invalid charset */ s = charset_to_utf8(ASCII_1, sizeof(ASCII_1), NULL, ENCODING_NONE); CU_ASSERT_PTR_NULL(s); /* simple ASCII string */ s = charset_to_utf8(ASCII_1, sizeof(ASCII_1)-1, cs, ENCODING_NONE); CU_ASSERT_PTR_NOT_NULL(s); CU_ASSERT_STRING_EQUAL(s, ASCII_1); free(s); /* ASCII string with an invalid character */ s = charset_to_utf8(ASCII_2, sizeof(ASCII_2)-1, cs, ENCODING_NONE); CU_ASSERT_PTR_NOT_NULL(s); CU_ASSERT_STRING_EQUAL(s, UTF8_2); free(s); /* base64 encoding */ s = charset_to_utf8(BASE64_3, sizeof(BASE64_3)-1, cs, ENCODING_BASE64); CU_ASSERT_PTR_NOT_NULL(s); CU_ASSERT_STRING_EQUAL(s, ASCII_1); free(s); /* Quoted-printable encoding */ s = charset_to_utf8(QP_4, sizeof(QP_4)-1, cs, ENCODING_QP); CU_ASSERT_PTR_NOT_NULL(s); CU_ASSERT_STRING_EQUAL(s, ASCII_4); free(s); charset_free(&cs); } static void test_to_imaputf7(void) { charset_t csu8 = charset_lookupname("utf-8"); CU_ASSERT_PTR_NOT_NULL(csu8); charset_t csu7 = charset_lookupname("imap-utf-7"); CU_ASSERT_PTR_NOT_NULL(csu7); #define TESTCASE(in, want) \ { \ char *s; \ char *q; \ static const char _in[] = (in); \ static const char _want[] = (want); \ s = charset_to_imaputf7(_in, strlen(_in), csu8, ENCODING_NONE); \ CU_ASSERT_PTR_NOT_NULL(s); \ CU_ASSERT_STRING_EQUAL(s, _want); \ q = charset_to_utf8(s, strlen(s), csu7, ENCODING_NONE); \ CU_ASSERT_PTR_NOT_NULL(q); \ CU_ASSERT_STRING_EQUAL(q, _in); \ free(q); \ free(s); \ } /* Plain IMAP UTF-7 */ TESTCASE("Hello, World", "Hello, World"); /* Escaped ampersand */ TESTCASE("Laurel&Hardy", "Laurel&-Hardy"); /* LATIN SMALL LETTER O WITH DIAERESIS (U+00F6) */ TESTCASE("Tr""\xC3\xB6""del", "Tr&APY-del"); /* LATIN SMALL LETTER E WITH ACUTE (U+00E9) */ TESTCASE("R""\xC3\xA9""pertoire", "R&AOk-pertoire"); /* WHITE SMILING FACE' (U+263A) */ TESTCASE("Hi Mom \xE2\x98\xBA!", "Hi Mom &Jjo-!"); /* WHITE SMILING FACE' (U+263A) at end */ TESTCASE("Hi Mom \xE2\x98\xBA", "Hi Mom &Jjo-"); /* DESERET SMALL LETTER YEE (U+10437) & HAN Character (U+24B62) */ TESTCASE("\xF0\x90\x90\xB7""&""\xF0\xA4\xAD\xA2", "&2AHcNw-&-&2FLfYg-"); /* CARRIAGE RETURN (CR) (U+000D) LINE FEED (LF) (U+000A) */ TESTCASE("\x0D\x0A", "&AA0ACg-"); #undef TESTCASE charset_free(&csu8); charset_free(&csu7); } static void test_misc_charsets(void) { #define TESTCASE(alias, in, want) \ { \ char *s; \ static const char _in[] = (in); \ static const char _want[] = (want); \ charset_t cs = charset_lookupname(alias); \ CU_ASSERT_PTR_NOT_NULL(cs); \ s = charset_to_utf8(_in, strlen(_in), cs, ENCODING_NONE); \ CU_ASSERT_PTR_NOT_NULL(s); \ CU_ASSERT_STRING_EQUAL(s, _want); \ free(s); \ charset_free(&cs); \ } /* MSDOS Latin1 aka CP-850 */ TESTCASE("cp850", "Hello, World", "Hello, World"); TESTCASE("cp850", "fa""\x87""ade", "fa""\xc3\xa7""ade"); /* Windows-31J aka CP-932 */ TESTCASE("windows-31J", "Hello, World", "Hello, World"); TESTCASE("cp932", "Hello, ""\x90\xa2\x8a\x45", "Hello, ""\xe4\xb8\x96\xe7\x95\x8c"); /* Windows-936 aka CP-936 */ TESTCASE("windows-936", "Hello, World", "Hello, World"); TESTCASE("cp936", "\xC4\xE3\xBA\xC3\xA3\xAC\xCA\xC0\xBD\xE7", "\xE4\xBD\xA0\xE5\xA5\xBD\xEF\xBC\x8C\xE4" "\xB8\x96\xE7\x95\x8C"); /* Windows-1257 aka CP-1257 */ TESTCASE("windows-1257", "Hello, World", "Hello, World"); TESTCASE("cp1257", "\xe0\xd8\xc2", "\xC4\x85\xC5\xB2\xC4\x80"); /* KOI8-U */ TESTCASE("koi8-u", "Hello, World", "Hello, World"); TESTCASE("koi8-u", "\xA4\xA6\xA7\xAD\xB4\xB6\xB7\xBD", "\xD1\x94\xD1\x96\xD1\x97\xD2\x91" "\xD0\x84\xD0\x86\xD0\x87\xD2\x90"); #undef TESTCASE } static void test_qp(void) { /* corner cases in Quoted-Printable */ #define TESTCASE(in, cs, enc, exp) \ { \ static const char _in[] = (in); \ static const char _exp[] = (exp); \ charset_t _cs = charset_lookupname(cs); \ CU_ASSERT_PTR_NOT_NULL(_cs); \ int _enc = (enc); \ char *s = charset_to_utf8(_in, sizeof(_in)-1, _cs, _enc); \ CU_ASSERT_PTR_NOT_NULL(s); \ CU_ASSERT_STRING_EQUAL(s, _exp); \ free(s); \ charset_free(&_cs); \ } /* encoding of SP */ TESTCASE("ab=20xy", "us-ascii", ENCODING_QP, "ab xy"); /* encoding of '=' */ TESTCASE("ab=3Dxy", "us-ascii", ENCODING_QP, "ab=xy"); /* lowercase also */ TESTCASE("ab=3dxy", "us-ascii", ENCODING_QP, "ab=xy"); /* underscore is not special outside of headers */ TESTCASE("ab_xy", "us-ascii", ENCODING_QP, "ab_xy"); /* invalid characters after = are passed through * even if one of them is a valid hexchar */ TESTCASE("ab=ZZxy", "us-ascii", ENCODING_QP, "ab=ZZxy"); TESTCASE("ab=ZCxy", "us-ascii", ENCODING_QP, "ab=ZCxy"); TESTCASE("ab=CZxy", "us-ascii", ENCODING_QP, "ab=CZxy"); TESTCASE("ab=Zcxy", "us-ascii", ENCODING_QP, "ab=Zcxy"); TESTCASE("ab=cZxy", "us-ascii", ENCODING_QP, "ab=cZxy"); /* soft line break */ TESTCASE("ab=\r\nxy", "us-ascii", ENCODING_QP, "abxy"); #undef TESTCASE } static void test_encode_mimeheader(void) { /* corner cases in Quoted-Printable */ #define TESTCASE(in, exp) \ { \ static const char _in[] = (in); \ static const char _exp[] = (exp); \ char *s = charset_encode_mimeheader(_in, 0); \ CU_ASSERT_PTR_NOT_NULL(s); \ CU_ASSERT_STRING_EQUAL(s, _exp); \ const char *p, *lf; \ for (lf = s, p = s; *p != '\0'; p++) { \ if (*p == '\n') { \ CU_ASSERT(p - lf <= 76); \ lf = p; \ } \ } \ CU_ASSERT(p - lf <= 76); \ free(s); \ } TESTCASE("abc", "abc"); TESTCASE("abc\r\n", "=?UTF-8?Q?abc?="); /* bogus indent */ TESTCASE("abc\r\nxyz", "=?UTF-8?Q?abc?=\r\n =?UTF-8?Q?xyz?="); /* wrap */ TESTCASE("abc\r\n xyz", "=?UTF-8?Q?abc?=\r\n =?UTF-8?Q?xyz?="); /* three-byte UTF-8 word barely fits line length limit */ TESTCASE("0123456789012345678901234567890123456789012345678901234\xe2\x82\xac", "=?UTF-8?Q?0123456789012345678901234567890123456789012345678901234=E2=82=AC?="); /* three-byte UTF-8 word must not be split */ TESTCASE("01234567890123456789012345678901234567890123456789012345\xe2\x82\xac", "=?UTF-8?Q?01234567890123456789012345678901234567890123456789012345?=" "\r\n ""=?UTF-8?Q?=E2=82=AC?="); #undef TESTCASE } static void test_decode_mimeheader(void) { char *s; static const char ASCII_1[] = "Lorem IPSUM dolor \t \t sit amet"; static const char SEARCH_1[] = "LOREM IPSUM DOLOR SIT AMET"; static const char ASCII_B64_2[] = "Lorem =?us-ascii?q?ipsum?= dolor " "=?US-ASCII?Q?sit amet?="; static const char ASCII_B64_3[] = "Lorem =?iso-8859-1?q?ips=fcm?= \t" "DOLOR =?iso-8859-1?Q?s=eft am=ebt?="; static const char SEARCH_3[] = "LOREM IPSUM DOLOR SIT AMET"; static const char SEARCH_3b[] = "LOREM IPSÜM DOLOR SÏT AMËT"; static const char SEARCH_3c[] = "LOREMIPSUMDOLORSITAMET"; static const char SEARCH_3d[] = "LOREMIPSÜMDOLORSÏTAMËT"; static const char SEARCH_3e[] = "LOREM IPSÜM DOLOR SÏT AMËT"; static const char HTML_4[] = "=?utf-8?q?=C2=A1Hola,_se=C3=B1or!?="; static const char HTML_4a[] = "¡HOLA, SEN""\xcc\x83""OR!"; static const char HTML_4b[] = "¡HOLA, SENOR!"; static const char HTML_4c[] = "¡Hola, señor!"; static const char HTML_4d[] = "¡<em>Hola</em>, señor!"; int flags = CHARSET_SKIPDIACRIT | CHARSET_MERGESPACE; /* default */ s = charset_decode_mimeheader(NULL, flags); CU_ASSERT_PTR_NULL(s); free(s); s = charset_decode_mimeheader("", flags); CU_ASSERT_PTR_NOT_NULL(s); CU_ASSERT_STRING_EQUAL(s, ""); free(s); s = charset_decode_mimeheader(ASCII_1, flags); CU_ASSERT_PTR_NOT_NULL(s); CU_ASSERT_STRING_EQUAL(s, SEARCH_1); free(s); s = charset_decode_mimeheader(ASCII_B64_2, flags); CU_ASSERT_PTR_NOT_NULL(s); CU_ASSERT_STRING_EQUAL(s, SEARCH_1); free(s); s = charset_decode_mimeheader(ASCII_B64_3, flags); CU_ASSERT_PTR_NOT_NULL(s); CU_ASSERT_STRING_EQUAL(s, SEARCH_3); free(s); flags = CHARSET_MERGESPACE; s = charset_decode_mimeheader(ASCII_B64_3, flags); CU_ASSERT_PTR_NOT_NULL(s); CU_ASSERT_STRING_EQUAL(s, SEARCH_3b); free(s); flags = CHARSET_SKIPSPACE | CHARSET_SKIPDIACRIT; s = charset_decode_mimeheader(ASCII_B64_3, flags); CU_ASSERT_PTR_NOT_NULL(s); CU_ASSERT_STRING_EQUAL(s, SEARCH_3c); free(s); flags = CHARSET_SKIPSPACE; s = charset_decode_mimeheader(ASCII_B64_3, flags); CU_ASSERT_PTR_NOT_NULL(s); CU_ASSERT_STRING_EQUAL(s, SEARCH_3d); free(s); flags = 0; s = charset_decode_mimeheader(ASCII_B64_3, flags); CU_ASSERT_PTR_NOT_NULL(s); CU_ASSERT_STRING_EQUAL(s, SEARCH_3e); free(s); flags = 0; s = charset_decode_mimeheader(HTML_4, flags); CU_ASSERT_PTR_NOT_NULL(s); CU_ASSERT_STRING_EQUAL(s, HTML_4a); free(s); flags = CHARSET_SKIPDIACRIT; s = charset_decode_mimeheader(HTML_4, flags); CU_ASSERT_PTR_NOT_NULL(s); CU_ASSERT_STRING_EQUAL(s, HTML_4b); free(s); flags = CHARSET_SNIPPET; s = charset_decode_mimeheader(HTML_4, flags); CU_ASSERT_PTR_NOT_NULL(s); CU_ASSERT_STRING_EQUAL(s, HTML_4c); free(s); flags = CHARSET_SNIPPET|CHARSET_ESCAPEHTML; s = charset_decode_mimeheader(HTML_4, flags); CU_ASSERT_PTR_NOT_NULL(s); CU_ASSERT_STRING_EQUAL(s, HTML_4d); free(s); static const char ASCII_EUC_KR[] = "A =?EUC-KR?B?wMzIo8Dn?= B"; static const char SEARCH_EUC_KR[] = "A""\x20\xec\x9d\xb4\xed\x98\xb8\xec\x9e\xac""B"; flags = CHARSET_SKIPDIACRIT | CHARSET_MERGESPACE; /* default */ s = charset_decode_mimeheader(ASCII_EUC_KR, flags); CU_ASSERT_PTR_NOT_NULL(s); CU_ASSERT_STRING_EQUAL(s, SEARCH_EUC_KR); free(s); } static void test_parse_mimeheader(void) { char *s; static const char ASCII[] = "Lorem IPSUM"; static const char UTF8[] = "=?utf-8?q?=C2=A1Hola,_se=C3=B1or!?= Lorem IPSÜM"; static const char LATIN1[] = "=?ISO-8859-1?q?Caf=E9?= Lorem IPS""\xDC""M"; static const char UTF8_1[] = "¡Hola, señor! Lorem IPS" UTF8_REPLACEMENT UTF8_REPLACEMENT "M"; static const char UTF8_2[] = "¡Hola, señor! Lorem IPSÜM"; static const char LATIN1_1[] = "Café Lorem IPS" UTF8_REPLACEMENT "M"; int flags = 0; /* default */ s = charset_parse_mimeheader(NULL, flags); CU_ASSERT_PTR_NULL(s); free(s); s = charset_parse_mimeheader("", flags); CU_ASSERT_PTR_NOT_NULL(s); CU_ASSERT_STRING_EQUAL(s, ""); free(s); s = charset_parse_mimeheader(ASCII, flags); CU_ASSERT_PTR_NOT_NULL(s); CU_ASSERT_STRING_EQUAL(s, ASCII); free(s); s = charset_parse_mimeheader(UTF8, flags); CU_ASSERT_PTR_NOT_NULL(s); CU_ASSERT_STRING_EQUAL(s, UTF8_1); free(s); s = charset_parse_mimeheader(LATIN1, flags); CU_ASSERT_PTR_NOT_NULL(s); CU_ASSERT_STRING_EQUAL(s, LATIN1_1); free(s); flags = CHARSET_MIME_UTF8; s = charset_parse_mimeheader(ASCII, flags); CU_ASSERT_PTR_NOT_NULL(s); CU_ASSERT_STRING_EQUAL(s, ASCII); free(s); s = charset_parse_mimeheader(UTF8, flags); CU_ASSERT_PTR_NOT_NULL(s); CU_ASSERT_STRING_EQUAL(s, UTF8_2); free(s); s = charset_parse_mimeheader(LATIN1, flags); CU_ASSERT_PTR_NOT_NULL(s); CU_ASSERT_STRING_EQUAL(s, LATIN1_1); free(s); } static void test_mimeheader_badcharset(void) { /* when given an unknown charset, the entire word is * replaced with a single Unicode replacement char */ char *s; static const char ASCII_1[] = "A =?foo?B?wMzIo8Dn?= B"; static const char SEARCH_1[] = "A " UTF8_REPLACEMENT "B"; int flags = CHARSET_SKIPDIACRIT | CHARSET_MERGESPACE; /* default */ s = charset_decode_mimeheader(ASCII_1, flags); CU_ASSERT_PTR_NOT_NULL(s); CU_ASSERT_STRING_EQUAL(s, SEARCH_1); free(s); } static void test_unfold(void) { #define TESTCASE(in, wantSkip, wantKeep) \ { \ char *s; \ char *k; \ static const char _in[] = (in); \ static const char _wantSkip[] = (wantSkip); \ static const char _wantKeep[] = (wantKeep); \ s = charset_unfold(_in, strlen(_in), CHARSET_UNFOLD_SKIPWS); \ CU_ASSERT_PTR_NOT_NULL(s); \ CU_ASSERT_STRING_EQUAL(s, _wantSkip); \ k = charset_unfold(_in, strlen(_in), 0); \ CU_ASSERT_PTR_NOT_NULL(k); \ CU_ASSERT_STRING_EQUAL(k, _wantKeep); \ free(k); \ free(s); \ } /* Single line */ TESTCASE("abcdef", "abcdef", "abcdef"); /* Single line, ending in CRLF */ TESTCASE("abcdef\r\n", "abcdef", "abcdef"); /* Two lines */ TESTCASE("abc\r\ndef", "abc\r\ndef", "abc\r\ndef"); /* Two lines, first with continuation line */ TESTCASE("ab\r\n c\r\ndef", "abc\r\ndef", "ab c\r\ndef"); /* Two lines, both with continuation lines */ TESTCASE("a\r\n\t\r\n b\r\n c\r\nd\r\n ef", "abc\r\ndef", "a\t b c\r\nd ef"); /* One long, empty continuation line */ /* Typically, RFCs using unfolding forbid this case. */ TESTCASE("\r\n\t\r\n \r\n \r\n", "", "\t "); #undef TESTCASE } static void test_mime_unfold(void) { char *s; /* Test unfolding and the 'keep' space option. Note that 'keep' is * a bit of a misnomer, it actually converts whitespace characters * to SP before keeping the same *number* of chars, which is * actually quite unhelpful. */ s = charset_decode_mimeheader( "From: foo@bar\r\n" "To: baz@quux\r\n" "Subject: this\r\n" "\tline is continued\r\n" "Keywords: and\r\n" "\tso is\r\n" " this one\r\n" "\r\n", CHARSET_SKIPDIACRIT); CU_ASSERT_STRING_EQUAL(s, "FROM: FOO@BAR " "TO: BAZ@QUUX " "SUBJECT: THIS LINE IS CONTINUED " "KEYWORDS: AND SO IS THIS ONE " " " ); free(s); /* test unfolding and the 'merge' space option which merges any * amount of whitespace down to a single SP character */ s = charset_decode_mimeheader( "From: foo@bar\r\n" "To: baz@quux\r\n" "Subject: this\r\n" "\tline is continued\r\n" "Keywords: and\r\n" "\tso is\r\n" " this one\r\n" "\r\n", CHARSET_SKIPDIACRIT|CHARSET_MERGESPACE); CU_ASSERT_STRING_EQUAL(s, "FROM: FOO@BAR " "TO: BAZ@QUUX " "SUBJECT: THIS LINE IS CONTINUED " "KEYWORDS: AND SO IS THIS ONE " ); free(s); /* test unfolding and the 'skip' space option which elides * all whitespace. */ s = charset_decode_mimeheader( "From: foo@bar\r\n" "To: baz@quux\r\n" "Subject: this\r\n" "\tline is continued\r\n" "Keywords: and\r\n" "\tso is\r\n" " this one\r\n" "\r\n", CHARSET_SKIPDIACRIT|CHARSET_SKIPSPACE); CU_ASSERT_STRING_EQUAL(s, "FROM:FOO@BAR" "TO:BAZ@QUUX" "SUBJECT:THISLINEISCONTINUED" "KEYWORDS:ANDSOISTHISONE" ); free(s); } static void test_search_mimeheader(void) { char *s; comp_pat *pat; static const char SUBJECT_CP1252[] = "=?Cp1252?Q?Herzlichen_Gl=FCckwunsch,_der_Artikel_Canon_Ob?= " "=?Cp1252?Q?jektiv_EF-S_18-55_mm_1:3,5-5,6_geh=F6rt_Ihnen!?="; static const char SEARCH_CP1252[] = "Herzlichen"; int flags = CHARSET_SKIPDIACRIT | CHARSET_MERGESPACE; /* default */ charset_t cs = charset_lookupname("us-ascii"); s = charset_convert(SEARCH_CP1252, cs, flags); pat = charset_compilepat(s); CU_ASSERT(charset_search_mimeheader(s, pat, SUBJECT_CP1252, flags)); charset_freepat(pat); charset_free(&cs); free(s); } static void test_rfc5051(void) { /* Example: codepoint U+01C4 (LATIN CAPITAL LETTER DZ WITH CARON) * has a titlecase property of U+01C5 (LATIN CAPITAL LETTER D * WITH SMALL LETTER Z WITH CARON). Codepoint U+01C5 has a * decomposition property of U+0044 (LATIN CAPITAL LETTER D) * U+017E (LATIN SMALL LETTER Z WITH CARON). U+017E has a * decomposition property of U+007A (LATIN SMALL LETTER Z) U+030c */ char *s; static const char STR_RFC5051[] = {0xc7, 0x84, 0}; static const char RES_RFC5051[] = {'D', 'z', 0xcc, 0x8c, 0}; int flags = 0; /* super compliant */ charset_t cs; cs = charset_lookupname("utf-8"); s = charset_convert(STR_RFC5051, cs, flags); CU_ASSERT_PTR_NOT_NULL(s); CU_ASSERT_STRING_EQUAL(s, RES_RFC5051); charset_free(&cs); free(s); } struct text_rock { int ncalls; struct buf out; }; static void append_text(const struct buf *text, void *rock) { struct text_rock *tr = (struct text_rock *)rock; tr->ncalls++; buf_append(&tr->out, text); } #define TESTCASE(in, cs, enc, st, exp) \ { \ static const char _in[] = (in); \ charset_t _cs = charset_lookupname(cs); \ CU_ASSERT_PTR_NOT_NULL(_cs); \ int _enc = (enc); \ static const char _st[] = (st); \ static const char _exp[] = (exp); \ struct buf bin = BUF_INITIALIZER; \ struct text_rock tr; \ int r; \ \ memset(&tr, 0, sizeof(tr)); \ buf_init_ro(&bin, _in, sizeof(_in)-1); \ \ r = charset_extract(append_text, &tr, &bin, _cs, _enc, _st, flags); \ CU_ASSERT_EQUAL(r, 1); \ CU_ASSERT_EQUAL(tr.ncalls, 1); \ CU_ASSERT_STRING_EQUAL(buf_cstring(&tr.out), _exp); \ \ buf_free(&bin); \ buf_free(&tr.out); \ charset_free(&_cs); \ } static void test_extract(void) { int flags = CHARSET_SKIPDIACRIT | CHARSET_MERGESPACE; /* default */ /* data thanks to hipsteripsum.me */ /* simplest case - no space, plain text is capitalised */ TESTCASE("freegan", "us-ascii", ENCODING_NONE, "PLAIN", "FREEGAN"); /* capitalised text is still capitalised */ TESTCASE("FANNY PACK", "us-ascii", ENCODING_NONE, "PLAIN", "FANNY PACK"); /* single spaces become single spaces */ TESTCASE("before they sold out", "us-ascii", ENCODING_NONE, "PLAIN", "BEFORE THEY SOLD OUT"); /* multiple spaces are squashed to a single spaces */ TESTCASE("you probably \t haven't\r\nheard\t\r\tof them", "us-ascii", ENCODING_NONE, "PLAIN", "YOU PROBABLY HAVEN'T HEARD OF THEM"); /* invalid UTF-8 bytes become the Replacement character */ TESTCASE("a\300b", "us-ascii", ENCODING_NONE, "PLAIN", /* 0xC0 */ "A"UTF8_REPLACEMENT"B"); TESTCASE("a\301b", "us-ascii", ENCODING_NONE, "PLAIN", /* 0xC1 */ "A"UTF8_REPLACEMENT"B"); TESTCASE("a\365b", "us-ascii", ENCODING_NONE, "PLAIN", /* 0xF5 */ "A"UTF8_REPLACEMENT"B"); TESTCASE("a\366b", "us-ascii", ENCODING_NONE, "PLAIN", /* 0xF6 */ "A"UTF8_REPLACEMENT"B"); TESTCASE("a\367b", "us-ascii", ENCODING_NONE, "PLAIN", /* 0xF7 */ "A"UTF8_REPLACEMENT"B"); TESTCASE("a\370b", "us-ascii", ENCODING_NONE, "PLAIN", /* 0xF8 */ "A"UTF8_REPLACEMENT"B"); TESTCASE("a\371b", "us-ascii", ENCODING_NONE, "PLAIN", /* 0xF9 */ "A"UTF8_REPLACEMENT"B"); TESTCASE("a\372b", "us-ascii", ENCODING_NONE, "PLAIN", /* 0xFA */ "A"UTF8_REPLACEMENT"B"); TESTCASE("a\373b", "us-ascii", ENCODING_NONE, "PLAIN", /* 0xFB */ "A"UTF8_REPLACEMENT"B"); TESTCASE("a\374b", "us-ascii", ENCODING_NONE, "PLAIN", /* 0xFC */ "A"UTF8_REPLACEMENT"B"); TESTCASE("a\375b", "us-ascii", ENCODING_NONE, "PLAIN", /* 0xFD */ "A"UTF8_REPLACEMENT"B"); TESTCASE("a\376b", "us-ascii", ENCODING_NONE, "PLAIN", /* 0xFE */ "A"UTF8_REPLACEMENT"B"); TESTCASE("a\377b", "us-ascii", ENCODING_NONE, "PLAIN", /* 0xFF */ "A"UTF8_REPLACEMENT"B"); /* ill-formed UTF-8 sequences become the Replacement character */ /* 2-byte sequence lead byte then a non-continuation byte */ TESTCASE("a\302bcd", "us-ascii", ENCODING_NONE, "PLAIN", "A"UTF8_REPLACEMENT"BCD"); /* 3-byte sequence lead byte then a non-continuation byte */ TESTCASE("a\340bcde", "us-ascii", ENCODING_NONE, "PLAIN", "A"UTF8_REPLACEMENT"BCDE"); /* 4-byte sequence lead byte then a non-continuation byte */ TESTCASE("a\360bcdef", "us-ascii", ENCODING_NONE, "PLAIN", "A"UTF8_REPLACEMENT"BCDEF"); /* unexpected continuation byte */ TESTCASE("a\240bc", "us-ascii", ENCODING_NONE, "PLAIN", "A"UTF8_REPLACEMENT"BC"); /* HTML: correctly formed balanced tag pairs */ TESTCASE("Photo booth", "us-ascii", ENCODING_NONE, "HTML", "PHOTO BOOTH"); /* HTML: unbalanced tags */ TESTCASE("American Apparel", "us-ascii", ENCODING_NONE, "HTML", "AMERICAN APPAREL"); /* HTML: OMITTAG tags with and without end tags */ TESTCASE("
Terry

Richardson", "us-ascii", ENCODING_NONE, "HTML", " TERRY RICHARDSON "); /* HTML: non-phrasing tags are replaced with whitespace */ TESTCASE("hella
mlkshk", "us-ascii", ENCODING_NONE, "HTML", "HELLA MLKSHK"); TESTCASE("godard
synth", "us-ascii", ENCODING_NONE, "HTML", "GODARD SYNTH"); TESTCASE("
vinyl
narwhal
", "us-ascii", ENCODING_NONE, "HTML", " VINYL NARWHAL "); /* HTML: quoted tag parameters */ TESTCASE("leggings gastropub", "us-ascii", ENCODING_NONE, "HTML", "LEGGINGS GASTROPUB"); /* HTML: unquoted tag parameters */ TESTCASE("biodiesel seitan", "us-ascii", ENCODING_NONE, "HTML", "BIODIESEL SEITAN"); /* HTML: contents of SCRIPT tag */ TESTCASE("viral readymade", "us-ascii", ENCODING_NONE, "HTML", "VIRAL READYMADE"); /* HTML: HTML4 SCRIPT tag with no contents */ TESTCASE("cardigan williamsburg", "us-ascii", ENCODING_NONE, "HTML", "CARDIGAN WILLIAMSBURG"); /* HTML: XHTML SCRIPT empty-element-tag aka self-closing tag */ TESTCASE("brunch