1#include <stdlib.h> 2 3#include "cunit/cunit.h" 4#include "charset.h" 5 6extern int charset_debug; 7 8/* The Unicode Replacement character 0xfffd in UTF-8 encoding */ 9#define UTF8_REPLACEMENT "\357\277\275" 10/* The Replacement char after search normalisation */ 11#define SEARCH_REPLACEMENT "\377" 12 13static void test_lookupname(void) 14{ 15 charset_t cs, cs2; 16 17 /* us-ascii must exist */ 18 cs = charset_lookupname("us-ascii"); 19 CU_ASSERT_PTR_NOT_NULL(cs); 20 21 /* names are case-insensitive */ 22 cs2 = charset_lookupname("US-ASCII"); 23 CU_ASSERT_PTR_NOT_NULL(cs2); 24 CU_ASSERT_STRING_EQUAL(charset_name(cs), charset_name(cs2)); 25 charset_free(&cs2); 26 27 cs2 = charset_lookupname("Us-AsCiI"); 28 CU_ASSERT_PTR_NOT_NULL(cs2); 29 CU_ASSERT_STRING_EQUAL(charset_name(cs), charset_name(cs2)); 30 charset_free(&cs2); 31 charset_free(&cs); 32 33 /* some others must also exist */ 34 cs = charset_lookupname("utf-8"); 35 CU_ASSERT_PTR_NOT_NULL(cs); 36 charset_free(&cs); 37 38 cs = charset_lookupname("utf-7"); 39 CU_ASSERT_PTR_NOT_NULL(cs); 40 charset_free(&cs); 41 42 cs = charset_lookupname("iso-8859-1"); 43 CU_ASSERT_PTR_NOT_NULL(cs); 44 charset_free(&cs); 45 46 /* 47 * Assert that ICU-backed charsets return the Cyrus-canonical name. E.g: 48 * ICU names windows-1252 as ibm-5348_P100-1997 but we don't want that. 
49 */ 50 cs = charset_lookupname("windows-1252"); 51 CU_ASSERT_PTR_NOT_NULL(cs); 52 CU_ASSERT_STRING_EQUAL(charset_name(cs), "windows-1252"); 53 charset_free(&cs); 54 55 cs = charset_lookupname("1252"); 56 CU_ASSERT_PTR_NOT_NULL(cs); 57 CU_ASSERT_STRING_EQUAL(charset_name(cs), "windows-1252"); 58 charset_free(&cs); 59 60 /* But still use the ICU name if there's no good alias for it */ 61 cs = charset_lookupname("ebcdic-ar"); 62 CU_ASSERT_PTR_NOT_NULL(cs); 63 CU_ASSERT_STRING_EQUAL(charset_name(cs), "ibm-16804_X110-1999"); 64 charset_free(&cs); 65 66} 67 68static void test_to_utf8(void) 69{ 70 charset_t cs; 71 char *s; 72 static const char ASCII_1[] = "Hello World"; 73 static const char ASCII_2[] = "Hello W\370rld"; 74 static const char UTF8_2[] = "Hello W" UTF8_REPLACEMENT "rld"; 75 static const char BASE64_3[] = "SGVsbG8gV29ybGQ="; 76 static const char QP_4[] = 77"If you believe that truth=3Dbeauty, then surely=20=\r\n" 78"mathematics is the most beautiful branch of philosophy.\r\n"; 79 static const char ASCII_4[] = 80"If you believe that truth=beauty, then surely " 81"mathematics is the most beautiful branch of philosophy.\r\n"; 82 83 cs = charset_lookupname("us-ascii"); 84 CU_ASSERT_PTR_NOT_NULL(cs); 85 86 /* zero length input */ 87 s = charset_to_utf8("", 0, cs, ENCODING_NONE); 88 CU_ASSERT_PTR_NOT_NULL(s); 89 CU_ASSERT_STRING_EQUAL(s, ""); 90 free(s); 91 92 /* invalid encoding */ 93 s = charset_to_utf8(ASCII_1, sizeof(ASCII_1), cs, 0xdeadbeef); 94 CU_ASSERT_PTR_NULL(s); 95 96 /* invalid charset */ 97 s = charset_to_utf8(ASCII_1, sizeof(ASCII_1), NULL, ENCODING_NONE); 98 CU_ASSERT_PTR_NULL(s); 99 100 /* simple ASCII string */ 101 s = charset_to_utf8(ASCII_1, sizeof(ASCII_1)-1, cs, ENCODING_NONE); 102 CU_ASSERT_PTR_NOT_NULL(s); 103 CU_ASSERT_STRING_EQUAL(s, ASCII_1); 104 free(s); 105 106 /* ASCII string with an invalid character */ 107 s = charset_to_utf8(ASCII_2, sizeof(ASCII_2)-1, cs, ENCODING_NONE); 108 CU_ASSERT_PTR_NOT_NULL(s); 109 CU_ASSERT_STRING_EQUAL(s, 
UTF8_2); 110 free(s); 111 112 /* base64 encoding: BASE64_3 must decode to ASCII_1 */ 113 s = charset_to_utf8(BASE64_3, sizeof(BASE64_3)-1, cs, ENCODING_BASE64); 114 CU_ASSERT_PTR_NOT_NULL(s); 115 CU_ASSERT_STRING_EQUAL(s, ASCII_1); 116 free(s); 117 118 /* Quoted-printable encoding: QP_4 must decode to ASCII_4 */ 119 s = charset_to_utf8(QP_4, sizeof(QP_4)-1, cs, ENCODING_QP); 120 CU_ASSERT_PTR_NOT_NULL(s); 121 CU_ASSERT_STRING_EQUAL(s, ASCII_4); 122 free(s); 123 124 charset_free(&cs); 125} 126 /* Round-trip test: every case converts a UTF-8 input to IMAP UTF-7, compares it with the expected text, then converts the result back through the imap-utf-7 charset and compares it with the original input. */ 127static void test_to_imaputf7(void) 128{ 129 charset_t csu8 = charset_lookupname("utf-8"); 130 CU_ASSERT_PTR_NOT_NULL(csu8); 131 charset_t csu7 = charset_lookupname("imap-utf-7"); 132 CU_ASSERT_PTR_NOT_NULL(csu7); 133 134#define TESTCASE(in, want) \ 135 { \ 136 char *s; \ 137 char *q; \ 138 static const char _in[] = (in); \ 139 static const char _want[] = (want); \ 140 s = charset_to_imaputf7(_in, strlen(_in), csu8, ENCODING_NONE); \ 141 CU_ASSERT_PTR_NOT_NULL(s); \ 142 CU_ASSERT_STRING_EQUAL(s, _want); \ 143 q = charset_to_utf8(s, strlen(s), csu7, ENCODING_NONE); \ 144 CU_ASSERT_PTR_NOT_NULL(q); \ 145 CU_ASSERT_STRING_EQUAL(q, _in); \ 146 free(q); \ 147 free(s); \ 148 } 149 150 /* Plain ASCII passes through unchanged */ 151 TESTCASE("Hello, World", "Hello, World"); 152 153 /* Escaped ampersand */ 154 TESTCASE("Laurel&Hardy", "Laurel&-Hardy"); 155 156 /* LATIN SMALL LETTER O WITH DIAERESIS (U+00F6) */ 157 TESTCASE("Tr""\xC3\xB6""del", "Tr&APY-del"); 158 159 /* LATIN SMALL LETTER E WITH ACUTE (U+00E9) */ 160 TESTCASE("R""\xC3\xA9""pertoire", "R&AOk-pertoire"); 161 162 /* WHITE SMILING FACE (U+263A) */ 163 TESTCASE("Hi Mom \xE2\x98\xBA!", "Hi Mom &Jjo-!"); 164 165 /* WHITE SMILING FACE (U+263A) at end */ 166 TESTCASE("Hi Mom \xE2\x98\xBA", "Hi Mom &Jjo-"); 167 168 /* DESERET SMALL LETTER YEE (U+10437) & HAN Character (U+24B62) */ 169 TESTCASE("\xF0\x90\x90\xB7""&""\xF0\xA4\xAD\xA2", "&2AHcNw-&-&2FLfYg-"); 170 171 /* CARRIAGE RETURN (CR) (U+000D) LINE FEED (LF) (U+000A) */ 172 TESTCASE("\x0D\x0A", "&AA0ACg-"); 173 174#undef TESTCASE 175 176 
charset_free(&csu8); 177 charset_free(&csu7); 178} 179 180static void test_misc_charsets(void) 181{ 182#define TESTCASE(alias, in, want) \ 183 { \ 184 char *s; \ 185 static const char _in[] = (in); \ 186 static const char _want[] = (want); \ 187 charset_t cs = charset_lookupname(alias); \ 188 CU_ASSERT_PTR_NOT_NULL(cs); \ 189 s = charset_to_utf8(_in, strlen(_in), cs, ENCODING_NONE); \ 190 CU_ASSERT_PTR_NOT_NULL(s); \ 191 CU_ASSERT_STRING_EQUAL(s, _want); \ 192 free(s); \ 193 charset_free(&cs); \ 194 } 195 196 /* MSDOS Latin1 aka CP-850 */ 197 TESTCASE("cp850", "Hello, World", "Hello, World"); 198 TESTCASE("cp850", "fa""\x87""ade", "fa""\xc3\xa7""ade"); 199 200 /* Windows-31J aka CP-932 */ 201 TESTCASE("windows-31J", "Hello, World", "Hello, World"); 202 TESTCASE("cp932", "Hello, ""\x90\xa2\x8a\x45", 203 "Hello, ""\xe4\xb8\x96\xe7\x95\x8c"); 204 205 /* Windows-936 aka CP-936 */ 206 TESTCASE("windows-936", "Hello, World", "Hello, World"); 207 TESTCASE("cp936", "\xC4\xE3\xBA\xC3\xA3\xAC\xCA\xC0\xBD\xE7", 208 "\xE4\xBD\xA0\xE5\xA5\xBD\xEF\xBC\x8C\xE4" 209 "\xB8\x96\xE7\x95\x8C"); 210 211 /* Windows-1257 aka CP-1257 */ 212 TESTCASE("windows-1257", "Hello, World", "Hello, World"); 213 TESTCASE("cp1257", "\xe0\xd8\xc2", "\xC4\x85\xC5\xB2\xC4\x80"); 214 215 /* KOI8-U */ 216 TESTCASE("koi8-u", "Hello, World", "Hello, World"); 217 TESTCASE("koi8-u", "\xA4\xA6\xA7\xAD\xB4\xB6\xB7\xBD", 218 "\xD1\x94\xD1\x96\xD1\x97\xD2\x91" 219 "\xD0\x84\xD0\x86\xD0\x87\xD2\x90"); 220 221#undef TESTCASE 222} 223 224static void test_qp(void) 225{ 226 /* corner cases in Quoted-Printable */ 227#define TESTCASE(in, cs, enc, exp) \ 228 { \ 229 static const char _in[] = (in); \ 230 static const char _exp[] = (exp); \ 231 charset_t _cs = charset_lookupname(cs); \ 232 CU_ASSERT_PTR_NOT_NULL(_cs); \ 233 int _enc = (enc); \ 234 char *s = charset_to_utf8(_in, sizeof(_in)-1, _cs, _enc); \ 235 CU_ASSERT_PTR_NOT_NULL(s); \ 236 CU_ASSERT_STRING_EQUAL(s, _exp); \ 237 free(s); \ 238 charset_free(&_cs); \ 239 } 
240 241 /* encoding of SP */ 242 TESTCASE("ab=20xy", "us-ascii", ENCODING_QP, "ab xy"); 243 244 /* encoding of '=' */ 245 TESTCASE("ab=3Dxy", "us-ascii", ENCODING_QP, "ab=xy"); 246 247 /* lowercase hex digits are decoded as well */ 248 TESTCASE("ab=3dxy", "us-ascii", ENCODING_QP, "ab=xy"); 249 250 /* underscore is not special outside of headers */ 251 TESTCASE("ab_xy", "us-ascii", ENCODING_QP, "ab_xy"); 252 253 /* invalid characters after = are passed through 254 * even if one of them is a valid hexchar */ 255 TESTCASE("ab=ZZxy", "us-ascii", ENCODING_QP, "ab=ZZxy"); 256 TESTCASE("ab=ZCxy", "us-ascii", ENCODING_QP, "ab=ZCxy"); 257 TESTCASE("ab=CZxy", "us-ascii", ENCODING_QP, "ab=CZxy"); 258 TESTCASE("ab=Zcxy", "us-ascii", ENCODING_QP, "ab=Zcxy"); 259 TESTCASE("ab=cZxy", "us-ascii", ENCODING_QP, "ab=cZxy"); 260 261 /* soft line break */ 262 TESTCASE("ab=\r\nxy", "us-ascii", ENCODING_QP, "abxy"); 263 264#undef TESTCASE 265} 266 267static void test_encode_mimeheader(void) 268{ 269 /* corner cases in MIME header encoding: each result of charset_encode_mimeheader() must equal the expected text, and the span between consecutive LFs (and from the last LF to the end of the string) is asserted to be at most 76 bytes */ 270#define TESTCASE(in, exp) \ 271 { \ 272 static const char _in[] = (in); \ 273 static const char _exp[] = (exp); \ 274 char *s = charset_encode_mimeheader(_in, 0); \ 275 CU_ASSERT_PTR_NOT_NULL(s); \ 276 CU_ASSERT_STRING_EQUAL(s, _exp); \ 277 const char *p, *lf; \ 278 for (lf = s, p = s; *p != '\0'; p++) { \ 279 if (*p == '\n') { \ 280 CU_ASSERT(p - lf <= 76); \ 281 lf = p; \ 282 } \ 283 } \ 284 CU_ASSERT(p - lf <= 76); \ 285 free(s); \ 286 } 287 288 TESTCASE("abc", "abc"); 289 290 TESTCASE("abc\r\n", "=?UTF-8?Q?abc?="); 291 292 /* bogus indent */ 293 TESTCASE("abc\r\nxyz", "=?UTF-8?Q?abc?=\r\n =?UTF-8?Q?xyz?="); 294 295 /* wrap */ 296 TESTCASE("abc\r\n xyz", "=?UTF-8?Q?abc?=\r\n =?UTF-8?Q?xyz?="); 297 298 /* three-byte UTF-8 word barely fits line length limit */ 299 TESTCASE("0123456789012345678901234567890123456789012345678901234\xe2\x82\xac", 300 "=?UTF-8?Q?0123456789012345678901234567890123456789012345678901234=E2=82=AC?="); 301 302 /* three-byte UTF-8 word must not be split 
*/ 303 TESTCASE("01234567890123456789012345678901234567890123456789012345\xe2\x82\xac", 304 "=?UTF-8?Q?01234567890123456789012345678901234567890123456789012345?=" 305 "\r\n ""=?UTF-8?Q?=E2=82=AC?="); 306 307#undef TESTCASE 308} 309 310 311static void test_decode_mimeheader(void) 312{ 313 char *s; 314 static const char ASCII_1[] = "Lorem IPSUM dolor \t \t sit amet"; 315 static const char SEARCH_1[] = "LOREM IPSUM DOLOR SIT AMET"; 316 static const char ASCII_B64_2[] = "Lorem =?us-ascii?q?ipsum?= dolor " 317 "=?US-ASCII?Q?sit amet?="; 318 static const char ASCII_B64_3[] = "Lorem =?iso-8859-1?q?ips=fcm?= \t" 319 "DOLOR =?iso-8859-1?Q?s=eft am=ebt?="; 320 static const char SEARCH_3[] = "LOREM IPSUM DOLOR SIT AMET"; 321 static const char SEARCH_3b[] = "LOREM IPSÜM DOLOR SÏT AMËT"; 322 static const char SEARCH_3c[] = "LOREMIPSUMDOLORSITAMET"; 323 static const char SEARCH_3d[] = "LOREMIPSÜMDOLORSÏTAMËT"; 324 static const char SEARCH_3e[] = "LOREM IPSÜM DOLOR SÏT AMËT"; 325 static const char HTML_4[] = "=?utf-8?q?=C2=A1<em>Hola</em>,_se=C3=B1or!?="; 326 static const char HTML_4a[] = "¡<EM>HOLA</EM>, SEN""\xcc\x83""OR!"; 327 static const char HTML_4b[] = "¡<EM>HOLA</EM>, SENOR!"; 328 static const char HTML_4c[] = "¡<em>Hola</em>, señor!"; 329 static const char HTML_4d[] = "¡<em>Hola</em>, señor!"; 330 int flags = CHARSET_SKIPDIACRIT | CHARSET_MERGESPACE; /* default */ 331 332 s = charset_decode_mimeheader(NULL, flags); 333 CU_ASSERT_PTR_NULL(s); 334 free(s); 335 336 s = charset_decode_mimeheader("", flags); 337 CU_ASSERT_PTR_NOT_NULL(s); 338 CU_ASSERT_STRING_EQUAL(s, ""); 339 free(s); 340 341 s = charset_decode_mimeheader(ASCII_1, flags); 342 CU_ASSERT_PTR_NOT_NULL(s); 343 CU_ASSERT_STRING_EQUAL(s, SEARCH_1); 344 free(s); 345 346 s = charset_decode_mimeheader(ASCII_B64_2, flags); 347 CU_ASSERT_PTR_NOT_NULL(s); 348 CU_ASSERT_STRING_EQUAL(s, SEARCH_1); 349 free(s); 350 351 s = charset_decode_mimeheader(ASCII_B64_3, flags); 352 CU_ASSERT_PTR_NOT_NULL(s); 353 
CU_ASSERT_STRING_EQUAL(s, SEARCH_3); 354 free(s); 355 356 flags = CHARSET_MERGESPACE; 357 s = charset_decode_mimeheader(ASCII_B64_3, flags); 358 CU_ASSERT_PTR_NOT_NULL(s); 359 CU_ASSERT_STRING_EQUAL(s, SEARCH_3b); 360 free(s); 361 362 flags = CHARSET_SKIPSPACE | CHARSET_SKIPDIACRIT; 363 s = charset_decode_mimeheader(ASCII_B64_3, flags); 364 CU_ASSERT_PTR_NOT_NULL(s); 365 CU_ASSERT_STRING_EQUAL(s, SEARCH_3c); 366 free(s); 367 368 flags = CHARSET_SKIPSPACE; 369 s = charset_decode_mimeheader(ASCII_B64_3, flags); 370 CU_ASSERT_PTR_NOT_NULL(s); 371 CU_ASSERT_STRING_EQUAL(s, SEARCH_3d); 372 free(s); 373 374 flags = 0; 375 s = charset_decode_mimeheader(ASCII_B64_3, flags); 376 CU_ASSERT_PTR_NOT_NULL(s); 377 CU_ASSERT_STRING_EQUAL(s, SEARCH_3e); 378 free(s); 379 380 flags = 0; 381 s = charset_decode_mimeheader(HTML_4, flags); 382 CU_ASSERT_PTR_NOT_NULL(s); 383 CU_ASSERT_STRING_EQUAL(s, HTML_4a); 384 free(s); 385 386 flags = CHARSET_SKIPDIACRIT; 387 s = charset_decode_mimeheader(HTML_4, flags); 388 CU_ASSERT_PTR_NOT_NULL(s); 389 CU_ASSERT_STRING_EQUAL(s, HTML_4b); 390 free(s); 391 392 flags = CHARSET_SNIPPET; 393 s = charset_decode_mimeheader(HTML_4, flags); 394 CU_ASSERT_PTR_NOT_NULL(s); 395 CU_ASSERT_STRING_EQUAL(s, HTML_4c); 396 free(s); 397 398 flags = CHARSET_SNIPPET|CHARSET_ESCAPEHTML; 399 s = charset_decode_mimeheader(HTML_4, flags); 400 CU_ASSERT_PTR_NOT_NULL(s); 401 CU_ASSERT_STRING_EQUAL(s, HTML_4d); 402 free(s); 403 404 static const char ASCII_EUC_KR[] = "A =?EUC-KR?B?wMzIo8Dn?= B"; 405 static const char SEARCH_EUC_KR[] = "A""\x20\xec\x9d\xb4\xed\x98\xb8\xec\x9e\xac""B"; 406 flags = CHARSET_SKIPDIACRIT | CHARSET_MERGESPACE; /* default */ 407 s = charset_decode_mimeheader(ASCII_EUC_KR, flags); 408 CU_ASSERT_PTR_NOT_NULL(s); 409 CU_ASSERT_STRING_EQUAL(s, SEARCH_EUC_KR); 410 free(s); 411} 412 413static void test_parse_mimeheader(void) 414{ 415 char *s; 416 static const char ASCII[] = "Lorem IPSUM"; 417 static const char UTF8[] = "=?utf-8?q?=C2=A1Hola,_se=C3=B1or!?= 
Lorem IPSÜM"; 418 static const char LATIN1[] = "=?ISO-8859-1?q?Caf=E9?= Lorem IPS""\xDC""M"; 419 420 static const char UTF8_1[] = "¡Hola, señor! Lorem IPS" UTF8_REPLACEMENT UTF8_REPLACEMENT "M"; 421 static const char UTF8_2[] = "¡Hola, señor! Lorem IPSÜM"; 422 static const char LATIN1_1[] = "Café Lorem IPS" UTF8_REPLACEMENT "M"; 423 424 int flags = 0; /* default */ 425 426 s = charset_parse_mimeheader(NULL, flags); 427 CU_ASSERT_PTR_NULL(s); 428 free(s); 429 430 s = charset_parse_mimeheader("", flags); 431 CU_ASSERT_PTR_NOT_NULL(s); 432 CU_ASSERT_STRING_EQUAL(s, ""); 433 free(s); 434 435 s = charset_parse_mimeheader(ASCII, flags); 436 CU_ASSERT_PTR_NOT_NULL(s); 437 CU_ASSERT_STRING_EQUAL(s, ASCII); 438 free(s); 439 440 s = charset_parse_mimeheader(UTF8, flags); 441 CU_ASSERT_PTR_NOT_NULL(s); 442 CU_ASSERT_STRING_EQUAL(s, UTF8_1); 443 free(s); 444 445 s = charset_parse_mimeheader(LATIN1, flags); 446 CU_ASSERT_PTR_NOT_NULL(s); 447 CU_ASSERT_STRING_EQUAL(s, LATIN1_1); 448 free(s); 449 450 flags = CHARSET_MIME_UTF8; 451 452 s = charset_parse_mimeheader(ASCII, flags); 453 CU_ASSERT_PTR_NOT_NULL(s); 454 CU_ASSERT_STRING_EQUAL(s, ASCII); 455 free(s); 456 457 s = charset_parse_mimeheader(UTF8, flags); 458 CU_ASSERT_PTR_NOT_NULL(s); 459 CU_ASSERT_STRING_EQUAL(s, UTF8_2); 460 free(s); 461 462 s = charset_parse_mimeheader(LATIN1, flags); 463 CU_ASSERT_PTR_NOT_NULL(s); 464 CU_ASSERT_STRING_EQUAL(s, LATIN1_1); 465 free(s); 466} 467 468static void test_mimeheader_badcharset(void) 469{ 470 /* when given an unknown charset, the entire word is 471 * replaced with a single Unicode replacement char */ 472 char *s; 473 static const char ASCII_1[] = "A =?foo?B?wMzIo8Dn?= B"; 474 static const char SEARCH_1[] = "A " UTF8_REPLACEMENT "B"; 475 int flags = CHARSET_SKIPDIACRIT | CHARSET_MERGESPACE; /* default */ 476 477 s = charset_decode_mimeheader(ASCII_1, flags); 478 CU_ASSERT_PTR_NOT_NULL(s); 479 CU_ASSERT_STRING_EQUAL(s, SEARCH_1); 480 free(s); 481} 482 483static void test_unfold(void) 
484{ 485#define TESTCASE(in, wantSkip, wantKeep) \ 486 { \ 487 char *s; \ 488 char *k; \ 489 static const char _in[] = (in); \ 490 static const char _wantSkip[] = (wantSkip); \ 491 static const char _wantKeep[] = (wantKeep); \ 492 s = charset_unfold(_in, strlen(_in), CHARSET_UNFOLD_SKIPWS); \ 493 CU_ASSERT_PTR_NOT_NULL(s); \ 494 CU_ASSERT_STRING_EQUAL(s, _wantSkip); \ 495 k = charset_unfold(_in, strlen(_in), 0); \ 496 CU_ASSERT_PTR_NOT_NULL(k); \ 497 CU_ASSERT_STRING_EQUAL(k, _wantKeep); \ 498 free(k); \ 499 free(s); \ 500 } 501 502 /* Single line */ 503 TESTCASE("abcdef", "abcdef", "abcdef"); 504 505 /* Single line, ending in CRLF */ 506 TESTCASE("abcdef\r\n", "abcdef", "abcdef"); 507 508 /* Two lines */ 509 TESTCASE("abc\r\ndef", "abc\r\ndef", "abc\r\ndef"); 510 511 /* Two lines, first with continuation line */ 512 TESTCASE("ab\r\n c\r\ndef", "abc\r\ndef", "ab c\r\ndef"); 513 514 /* Two lines, both with continuation lines */ 515 TESTCASE("a\r\n\t\r\n b\r\n c\r\nd\r\n ef", "abc\r\ndef", "a\t b c\r\nd ef"); 516 517 /* One long, empty continuation line */ 518 /* Typically, RFCs using unfolding forbid this case. */ 519 TESTCASE("\r\n\t\r\n \r\n \r\n", "", "\t "); 520 521#undef TESTCASE 522} 523 524static void test_mime_unfold(void) 525{ 526 char *s; 527 528 /* Test unfolding and the 'keep' space option. Note that 'keep' is 529 * a bit of a misnomer, it actually converts whitespace characters 530 * to SP before keeping the same *number* of chars, which is 531 * actually quite unhelpful. 
532 */ 533 s = charset_decode_mimeheader( 534"From: foo@bar\r\n" 535"To: baz@quux\r\n" 536"Subject: this\r\n" 537"\tline is continued\r\n" 538"Keywords: and\r\n" 539"\tso is\r\n" 540" this one\r\n" 541"\r\n", 542 CHARSET_SKIPDIACRIT); 543 CU_ASSERT_STRING_EQUAL(s, 544"FROM: FOO@BAR " 545"TO: BAZ@QUUX " 546"SUBJECT: THIS LINE IS CONTINUED " 547"KEYWORDS: AND SO IS THIS ONE " 548" " 549 ); 550 free(s); 551 552 /* test unfolding and the 'merge' space option which merges any 553 * amount of whitespace down to a single SP character */ 554 s = charset_decode_mimeheader( 555"From: foo@bar\r\n" 556"To: baz@quux\r\n" 557"Subject: this\r\n" 558"\tline is continued\r\n" 559"Keywords: and\r\n" 560"\tso is\r\n" 561" this one\r\n" 562"\r\n", 563 CHARSET_SKIPDIACRIT|CHARSET_MERGESPACE); 564 CU_ASSERT_STRING_EQUAL(s, 565"FROM: FOO@BAR " 566"TO: BAZ@QUUX " 567"SUBJECT: THIS LINE IS CONTINUED " 568"KEYWORDS: AND SO IS THIS ONE " 569 ); 570 free(s); 571 572 /* test unfolding and the 'skip' space option which elides 573 * all whitespace. 
*/ 574 s = charset_decode_mimeheader( 575"From: foo@bar\r\n" 576"To: baz@quux\r\n" 577"Subject: this\r\n" 578"\tline is continued\r\n" 579"Keywords: and\r\n" 580"\tso is\r\n" 581" this one\r\n" 582"\r\n", 583 CHARSET_SKIPDIACRIT|CHARSET_SKIPSPACE); 584 CU_ASSERT_STRING_EQUAL(s, 585"FROM:FOO@BAR" 586"TO:BAZ@QUUX" 587"SUBJECT:THISLINEISCONTINUED" 588"KEYWORDS:ANDSOISTHISONE" 589 ); 590 free(s); 591} 592 593static void test_search_mimeheader(void) 594{ 595 char *s; 596 comp_pat *pat; 597 static const char SUBJECT_CP1252[] = "=?Cp1252?Q?Herzlichen_Gl=FCckwunsch,_der_Artikel_Canon_Ob?= " 598 "=?Cp1252?Q?jektiv_EF-S_18-55_mm_1:3,5-5,6_geh=F6rt_Ihnen!?="; 599 static const char SEARCH_CP1252[] = "Herzlichen"; 600 int flags = CHARSET_SKIPDIACRIT | CHARSET_MERGESPACE; /* default */ 601 charset_t cs = charset_lookupname("us-ascii"); 602 603 s = charset_convert(SEARCH_CP1252, cs, flags); 604 pat = charset_compilepat(s); 605 CU_ASSERT(charset_search_mimeheader(s, pat, SUBJECT_CP1252, flags)); 606 charset_freepat(pat); 607 charset_free(&cs); 608 free(s); 609} 610 611static void test_rfc5051(void) 612{ 613 /* Example: codepoint U+01C4 (LATIN CAPITAL LETTER DZ WITH CARON) 614 * has a titlecase property of U+01C5 (LATIN CAPITAL LETTER D 615 * WITH SMALL LETTER Z WITH CARON). Codepoint U+01C5 has a 616 * decomposition property of U+0044 (LATIN CAPITAL LETTER D) 617 * U+017E (LATIN SMALL LETTER Z WITH CARON). 
U+017E has a 618 * decomposition property of U+007A (LATIN SMALL LETTER Z) U+030c 619 */ 620 char *s; 621 static const char STR_RFC5051[] = {0xc7, 0x84, 0}; 622 static const char RES_RFC5051[] = {'D', 'z', 0xcc, 0x8c, 0}; 623 int flags = 0; /* super compliant */ 624 charset_t cs; 625 626 cs = charset_lookupname("utf-8"); 627 s = charset_convert(STR_RFC5051, cs, flags); 628 CU_ASSERT_PTR_NOT_NULL(s); 629 CU_ASSERT_STRING_EQUAL(s, RES_RFC5051); 630 charset_free(&cs); 631 free(s); 632} 633 /* Rock handed to charset_extract() by the TESTCASE macro below: ncalls counts append_text() invocations and out accumulates the text they receive. */ 634struct text_rock { 635 int ncalls; 636 struct buf out; 637}; 638 /* Callback given to charset_extract(): bumps the call counter and appends the chunk to the rock output buffer. */ 639static void append_text(const struct buf *text, void *rock) 640{ 641 struct text_rock *tr = (struct text_rock *)rock; 642 643 tr->ncalls++; 644 buf_append(&tr->out, text); 645} 646 /* Each TESTCASE runs charset_extract() over the input bytes (minus the terminating NUL) with the named charset, the encoding and the st string (PLAIN or HTML in the cases below). It reads the flags variable of the enclosing function, and asserts a return value of 1, exactly one callback invocation and output equal to exp. */ 647#define TESTCASE(in, cs, enc, st, exp) \ 648 { \ 649 static const char _in[] = (in); \ 650 charset_t _cs = charset_lookupname(cs); \ 651 CU_ASSERT_PTR_NOT_NULL(_cs); \ 652 int _enc = (enc); \ 653 static const char _st[] = (st); \ 654 static const char _exp[] = (exp); \ 655 struct buf bin = BUF_INITIALIZER; \ 656 struct text_rock tr; \ 657 int r; \ 658 \ 659 memset(&tr, 0, sizeof(tr)); \ 660 buf_init_ro(&bin, _in, sizeof(_in)-1); \ 661 \ 662 r = charset_extract(append_text, &tr, &bin, _cs, _enc, _st, flags); \ 663 CU_ASSERT_EQUAL(r, 1); \ 664 CU_ASSERT_EQUAL(tr.ncalls, 1); \ 665 CU_ASSERT_STRING_EQUAL(buf_cstring(&tr.out), _exp); \ 666 \ 667 buf_free(&bin); \ 668 buf_free(&tr.out); \ 669 charset_free(&_cs); \ 670 } 671 672static void test_extract(void) 673{ 674 int flags = CHARSET_SKIPDIACRIT | CHARSET_MERGESPACE; /* default */ 675 /* data thanks to hipsteripsum.me */ 676 677 /* simplest case - no space, plain text is capitalised */ 678 TESTCASE("freegan", "us-ascii", ENCODING_NONE, "PLAIN", "FREEGAN"); 679 680 /* capitalised text is still capitalised */ 681 TESTCASE("FANNY PACK", "us-ascii", ENCODING_NONE, "PLAIN", "FANNY PACK"); 682 683 /* single spaces become single spaces */ 684 TESTCASE("before they sold out", 685 "us-ascii", 
ENCODING_NONE, "PLAIN", 686 "BEFORE THEY SOLD OUT"); 687 688 /* runs of whitespace (SP, TAB, CR, LF) are squashed to a single space */ 689 TESTCASE("you probably \t haven't\r\nheard\t\r\tof them", 690 "us-ascii", ENCODING_NONE, "PLAIN", 691 "YOU PROBABLY HAVEN'T HEARD OF THEM"); 692 693 /* invalid UTF-8 bytes become the Replacement character */ 694 TESTCASE("a\300b", "us-ascii", ENCODING_NONE, "PLAIN", /* 0xC0 */ 695 "A"UTF8_REPLACEMENT"B"); 696 TESTCASE("a\301b", "us-ascii", ENCODING_NONE, "PLAIN", /* 0xC1 */ 697 "A"UTF8_REPLACEMENT"B"); 698 TESTCASE("a\365b", "us-ascii", ENCODING_NONE, "PLAIN", /* 0xF5 */ 699 "A"UTF8_REPLACEMENT"B"); 700 TESTCASE("a\366b", "us-ascii", ENCODING_NONE, "PLAIN", /* 0xF6 */ 701 "A"UTF8_REPLACEMENT"B"); 702 TESTCASE("a\367b", "us-ascii", ENCODING_NONE, "PLAIN", /* 0xF7 */ 703 "A"UTF8_REPLACEMENT"B"); 704 TESTCASE("a\370b", "us-ascii", ENCODING_NONE, "PLAIN", /* 0xF8 */ 705 "A"UTF8_REPLACEMENT"B"); 706 TESTCASE("a\371b", "us-ascii", ENCODING_NONE, "PLAIN", /* 0xF9 */ 707 "A"UTF8_REPLACEMENT"B"); 708 TESTCASE("a\372b", "us-ascii", ENCODING_NONE, "PLAIN", /* 0xFA */ 709 "A"UTF8_REPLACEMENT"B"); 710 TESTCASE("a\373b", "us-ascii", ENCODING_NONE, "PLAIN", /* 0xFB */ 711 "A"UTF8_REPLACEMENT"B"); 712 TESTCASE("a\374b", "us-ascii", ENCODING_NONE, "PLAIN", /* 0xFC */ 713 "A"UTF8_REPLACEMENT"B"); 714 TESTCASE("a\375b", "us-ascii", ENCODING_NONE, "PLAIN", /* 0xFD */ 715 "A"UTF8_REPLACEMENT"B"); 716 TESTCASE("a\376b", "us-ascii", ENCODING_NONE, "PLAIN", /* 0xFE */ 717 "A"UTF8_REPLACEMENT"B"); 718 TESTCASE("a\377b", "us-ascii", ENCODING_NONE, "PLAIN", /* 0xFF */ 719 "A"UTF8_REPLACEMENT"B"); 720 721 /* ill-formed UTF-8 sequences become the Replacement character */ 722 723 /* 2-byte sequence lead byte then a non-continuation byte */ 724 TESTCASE("a\302bcd", "us-ascii", ENCODING_NONE, "PLAIN", 725 "A"UTF8_REPLACEMENT"BCD"); 726 /* 3-byte sequence lead byte then a non-continuation byte */ 727 TESTCASE("a\340bcde", "us-ascii", ENCODING_NONE, "PLAIN", 728 
"A"UTF8_REPLACEMENT"BCDE"); 729 /* 4-byte sequence lead byte then a non-continuation byte */ 730 TESTCASE("a\360bcdef", "us-ascii", ENCODING_NONE, "PLAIN", 731 "A"UTF8_REPLACEMENT"BCDEF"); 732 /* unexpected continuation byte */ 733 TESTCASE("a\240bc", "us-ascii", ENCODING_NONE, "PLAIN", 734 "A"UTF8_REPLACEMENT"BC"); 735 736 /* HTML: correctly formed balanced tag pairs */ 737 TESTCASE("<b>Photo</b> <em>booth</em>", 738 "us-ascii", ENCODING_NONE, "HTML", 739 "PHOTO BOOTH"); 740 741 /* HTML: unbalanced tags */ 742 TESTCASE("<b>American<b> <b>Apparel</b>", 743 "us-ascii", ENCODING_NONE, "HTML", 744 "AMERICAN APPAREL"); 745 746 /* HTML: OMITTAG tags with and without end tags */ 747 TESTCASE("<hr>Terry<hr> <hr>Richardson</hr>", 748 "us-ascii", ENCODING_NONE, "HTML", 749 " TERRY RICHARDSON "); 750 751 /* HTML: non-phrasing tags are replaced with whitespace */ 752 TESTCASE("hella<br>mlkshk", 753 "us-ascii", ENCODING_NONE, "HTML", 754 "HELLA MLKSHK"); 755 TESTCASE("godard<br/>synth", 756 "us-ascii", ENCODING_NONE, "HTML", 757 "GODARD SYNTH"); 758 TESTCASE("<div>vinyl</div><div>narwhal</div>", 759 "us-ascii", ENCODING_NONE, "HTML", 760 " VINYL NARWHAL "); 761 762 /* HTML: quoted tag parameters */ 763 TESTCASE("<a href=\"foo.html\">leggings</a> <img src\"beer.jpg\">gastropub", 764 "us-ascii", ENCODING_NONE, "HTML", 765 "LEGGINGS GASTROPUB"); 766 767 /* HTML: unquoted tag parameters */ 768 TESTCASE("<a href=foo.html>biodiesel</a> <img srcbeer.jpg>seitan", 769 "us-ascii", ENCODING_NONE, "HTML", 770 "BIODIESEL SEITAN"); 771 772 /* HTML: contents of SCRIPT tag */ 773 TESTCASE("viral <script>bicycle rights</script>readymade", 774 "us-ascii", ENCODING_NONE, "HTML", 775 "VIRAL READYMADE"); 776 777 /* HTML: HTML4 SCRIPT tag with no contents */ 778 TESTCASE("cardigan <script type=\"text/javascript\" " 779 "src=\"truffaut.js\"></script>williamsburg", 780 "us-ascii", ENCODING_NONE, "HTML", 781 "CARDIGAN WILLIAMSBURG"); 782 783 /* HTML: XHTML SCRIPT empty-element-tag aka self-closing tag 
*/ 784 TESTCASE("brunch <script type=\"text/javascript\" " 785 "src=\"cred.js\"/>shoreditch", 786 "us-ascii", ENCODING_NONE, "HTML", 787 "BRUNCH SHOREDITCH"); 788 789 /* HTML: contents of STYLE tag */ 790 TESTCASE("pickled <style>whatever tumblr</style>stumptown", 791 "us-ascii", ENCODING_NONE, "HTML", 792 "PICKLED STUMPTOWN"); 793 794 /* HTML: comments, correctly formed */ 795 TESTCASE("pinterest <!-- master cleanse -->forage", 796 "us-ascii", ENCODING_NONE, "HTML", 797 "PINTEREST FORAGE"); 798 799 /* HTML: comments correctly formed with embedded -- */ 800 TESTCASE("polaroid <!-- food -- truck -->letterpress", 801 "us-ascii", ENCODING_NONE, "HTML", 802 "POLAROID LETTERPRESS"); 803 804 /* HTML: comments correctly formed with embedded tags */ 805 TESTCASE("semiotics <!-- messenger <hr> bag -->scenester", 806 "us-ascii", ENCODING_NONE, "HTML", 807 "SEMIOTICS SCENESTER"); 808 809 /* HTML: comments correctly formed with embedded -> */ 810 TESTCASE("butcher <!-- cosby -> sweater -->whatever", 811 "us-ascii", ENCODING_NONE, "HTML", 812 "BUTCHER WHATEVER"); 813 814 /* HTML: comments correctly formed with ---> ending */ 815 TESTCASE("ennui <!-- art party --->keffiyeh", 816 "us-ascii", ENCODING_NONE, "HTML", 817 "ENNUI KEFFIYEH"); 818 819 /* HTML: trivial comment */ 820 TESTCASE("street <!-->art", 821 "us-ascii", ENCODING_NONE, "HTML", 822 "STREET ART"); 823 824 /* HTML: initial DOCTYPE is ignored */ 825 TESTCASE("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01//EN\" " 826 "\"http://www.w3.org/TR/html4/strict.dtd\">ethnic sustainable", 827 "us-ascii", ENCODING_NONE, "HTML", 828 "ETHNIC SUSTAINABLE"); 829 830 /* HTML: simple character references */ 831 TESTCASE(""Twee & Keytar" <dreamcatcher@umami.org>", 832 "us-ascii", ENCODING_NONE, "HTML", 833 "\"TWEE & KEYTAR\" <DREAMCATCHER@UMAMI.ORG>"); 834 835 /* HTML: naked & is emitted */ 836 TESTCASE("gentrify&<b>sartorial</b>", 837 "us-ascii", ENCODING_NONE, "HTML", 838 "GENTRIFY&SARTORIAL"); 839 840 /* HTML: non-zero length 
unterminated entities are emitted */ 841 TESTCASE("tattooed& locavore", 842 "us-ascii", ENCODING_NONE, "HTML", 843 "TATTOOED& LOCAVORE"); 844 845 /* HTML: decimal Unicode entities: U+267B RECYCLE SYMBOL */ 846 TESTCASE("odd♻future", 847 "us-ascii", ENCODING_NONE, "HTML", 848 "ODD♻FUTURE"); 849 850 /* HTML: hexadecimal Unicode entities: U+2704 SCISSORS */ 851 TESTCASE("odd✄future", 852 "us-ascii", ENCODING_NONE, "HTML", 853 "ODD✄FUTURE"); 854 855 /* HTML: compatibility numerical character references */ 856 TESTCASE( 857 "A€BC‚Dƒ" 858 "E„F…G†H‡" 859 "IˆJ‰KŠL‹" 860 "MŒNOŽP" 861 "QR‘S’T“" 862 "U”V•W–X—" 863 "Y˜Z™AšB›" 864 "CœDEžFŸg", 865 "us-ascii", ENCODING_NONE, "HTML", 866 "A€BC‚DƑ" /* ƒ capitalised */ 867 "E„F...G†H‡" /* … normalised to ... */ 868 "IˆJ‰KSL‹" /* Š normalised to S */ 869 "MŒNOZP" /* Ž normalised to Z */ 870 "QR‘S’T“" 871 "U”V•W–X—" 872 "Y˜ZTMASB›" /* š capitalised then normalised to S, 873 * ™ normalised to TM */ 874 "CŒDEZFYG") /* œ capitalised to Œ, 875 ž capitalised then normalised to Z, 876 * Ÿ normalised to Y */ 877 878 /* HTML: numerical character references to invalid Unicode 879 * codepoints and valid codepoints just adjacent to invalid 880 * ranges. HTML5 requires us to emit a Replacement char. 
*/ 881 TESTCASE("A퟿B", "us-ascii", ENCODING_NONE, "HTML", "A\355\237\277B"); 882 TESTCASE("A�B", "us-ascii", ENCODING_NONE, "HTML", "A"UTF8_REPLACEMENT"B"); 883 TESTCASE("A�B", "us-ascii", ENCODING_NONE, "HTML", "A"UTF8_REPLACEMENT"B"); 884 TESTCASE("A�B", "us-ascii", ENCODING_NONE, "HTML", "A"UTF8_REPLACEMENT"B"); 885 TESTCASE("A�B", "us-ascii", ENCODING_NONE, "HTML", "A"UTF8_REPLACEMENT"B"); 886 TESTCASE("A쀀B", "us-ascii", ENCODING_NONE, "HTML", "A\354\200\200B"); 887 TESTCASE("A􏿽B", "us-ascii", ENCODING_NONE, "HTML", "A\364\217\277\275B"); 888 TESTCASE("A�B", "us-ascii", ENCODING_NONE, "HTML", "A"UTF8_REPLACEMENT"B"); 889 TESTCASE("A�B", "us-ascii", ENCODING_NONE, "HTML", "A"UTF8_REPLACEMENT"B"); 890 TESTCASE("A�B", "us-ascii", ENCODING_NONE, "HTML", "A"UTF8_REPLACEMENT"B"); 891 892 /* HTML: zero numerical character reference. The HTML5 spec says 893 * to return a Replacement char. */ 894 TESTCASE("A�B", "us-ascii", ENCODING_NONE, "HTML", "A"UTF8_REPLACEMENT"B"); 895 896 /* HTML: numerical character references whose codepoints the HTML5 897 * spec says are a parse error. We just silently swallow these. 
*/ 898 /* U+0001..U+0008 */ 899 TESTCASE("AB", "us-ascii", ENCODING_NONE, "HTML", "AB"); 900 TESTCASE("AB", "us-ascii", ENCODING_NONE, "HTML", "AB"); 901 /* U+000B */ 902 TESTCASE("AB", "us-ascii", ENCODING_NONE, "HTML", "AB"); 903 /* U+000E..U+001F */ 904 TESTCASE("AB", "us-ascii", ENCODING_NONE, "HTML", "AB"); 905 TESTCASE("AB", "us-ascii", ENCODING_NONE, "HTML", "AB"); 906 /* U+007F..U+009f, when not a compatibility codepoint */ 907 TESTCASE("AB", "us-ascii", ENCODING_NONE, "HTML", "AB"); 908 TESTCASE("AB", "us-ascii", ENCODING_NONE, "HTML", "AB"); 909 TESTCASE("AB", "us-ascii", ENCODING_NONE, "HTML", "AB"); 910 TESTCASE("AB", "us-ascii", ENCODING_NONE, "HTML", "AB"); 911 TESTCASE("AB", "us-ascii", ENCODING_NONE, "HTML", "AB"); 912 TESTCASE("AB", "us-ascii", ENCODING_NONE, "HTML", "AB"); 913 /* U+FDD0..U+FDEF */ 914 TESTCASE("AB", "us-ascii", ENCODING_NONE, "HTML", "AB"); 915 TESTCASE("AB", "us-ascii", ENCODING_NONE, "HTML", "AB"); 916 /* the last two codepoints in each plane */ 917 TESTCASE("AB", "us-ascii", ENCODING_NONE, "HTML", "AB"); 918 TESTCASE("AB", "us-ascii", ENCODING_NONE, "HTML", "AB"); 919 TESTCASE("AB", "us-ascii", ENCODING_NONE, "HTML", "AB"); 920 TESTCASE("AB", "us-ascii", ENCODING_NONE, "HTML", "AB"); 921 TESTCASE("AB", "us-ascii", ENCODING_NONE, "HTML", "AB"); 922 TESTCASE("AB", "us-ascii", ENCODING_NONE, "HTML", "AB"); 923 TESTCASE("AB", "us-ascii", ENCODING_NONE, "HTML", "AB"); 924 TESTCASE("AB", "us-ascii", ENCODING_NONE, "HTML", "AB"); 925 TESTCASE("AB", "us-ascii", ENCODING_NONE, "HTML", "AB"); 926 TESTCASE("AB", "us-ascii", ENCODING_NONE, "HTML", "AB"); 927 TESTCASE("AB", "us-ascii", ENCODING_NONE, "HTML", "AB"); 928 TESTCASE("AB", "us-ascii", ENCODING_NONE, "HTML", "AB"); 929 TESTCASE("AB", "us-ascii", ENCODING_NONE, "HTML", "AB"); 930 TESTCASE("AB", "us-ascii", ENCODING_NONE, "HTML", "AB"); 931 TESTCASE("AB", "us-ascii", ENCODING_NONE, "HTML", "AB"); 932 TESTCASE("AB", "us-ascii", ENCODING_NONE, "HTML", "AB"); 933 TESTCASE("AB", 
"us-ascii", ENCODING_NONE, "HTML", "AB"); 934 TESTCASE("AB", "us-ascii", ENCODING_NONE, "HTML", "AB"); 935 TESTCASE("AB", "us-ascii", ENCODING_NONE, "HTML", "AB"); 936 TESTCASE("AB", "us-ascii", ENCODING_NONE, "HTML", "AB"); 937 TESTCASE("AB", "us-ascii", ENCODING_NONE, "HTML", "AB"); 938 TESTCASE("AB", "us-ascii", ENCODING_NONE, "HTML", "AB"); 939 TESTCASE("AB", "us-ascii", ENCODING_NONE, "HTML", "AB"); 940 TESTCASE("AB", "us-ascii", ENCODING_NONE, "HTML", "AB"); 941 TESTCASE("AB", "us-ascii", ENCODING_NONE, "HTML", "AB"); 942 TESTCASE("AB", "us-ascii", ENCODING_NONE, "HTML", "AB"); 943 TESTCASE("AB", "us-ascii", ENCODING_NONE, "HTML", "AB"); 944 TESTCASE("AB", "us-ascii", ENCODING_NONE, "HTML", "AB"); 945 TESTCASE("AB", "us-ascii", ENCODING_NONE, "HTML", "AB"); 946 TESTCASE("AB", "us-ascii", ENCODING_NONE, "HTML", "AB"); 947 TESTCASE("AB", "us-ascii", ENCODING_NONE, "HTML", "AB"); 948 TESTCASE("AB", "us-ascii", ENCODING_NONE, "HTML", "AB"); 949 TESTCASE("AB", "us-ascii", ENCODING_NONE, "HTML", "AB"); 950 TESTCASE("AB", "us-ascii", ENCODING_NONE, "HTML", "AB"); 951 952 /* HTML: some of the more obscure named character references. The 953 * tricky part is testing the case sensitivity and unusual character 954 * generation of the HTML character reference matching code, while 955 * the search normalisation code gets in the way. 
*/ 956 957 /* α and Α are both defined but both get normalised 958 * to GREEK CAPITAL LETTER ALPHA */ 959 TESTCASE("AαB", "us-ascii", ENCODING_NONE, "HTML", "AΑB"); 960 TESTCASE("AΑB", "us-ascii", ENCODING_NONE, "HTML", "AΑB"); 961 /* ♣ is defined, &Clubs is not */ 962 TESTCASE("A♣B", "us-ascii", ENCODING_NONE, "HTML", "A♣B"); 963 TESTCASE("A&Clubs;B", "us-ascii", ENCODING_NONE, "HTML", "A"UTF8_REPLACEMENT"B"); 964 /* fj is defined to emit a 2-codepoint sequence */ 965 TESTCASE("AfjB", "us-ascii", ENCODING_NONE, "HTML", "AFJB"); 966 /* ŷ emits a codepoint which is then normalised and capitalised */ 967 TESTCASE("AŷB", "us-ascii", ENCODING_NONE, "HTML", "AYB"); 968 /* ↑ and ↑ are both defined to the same codepoint, 969 * which survives normalisation intact, but neither &UParrow; nor 970 * &upARROW; are defined. &Uparrow is defined to a *different* 971 * codepoint which also survives normalisation. */ 972 TESTCASE("A↑B", "us-ascii", ENCODING_NONE, "HTML", "A↑B"); 973 TESTCASE("A⇑B", "us-ascii", ENCODING_NONE, "HTML", "A⇑B"); 974 TESTCASE("A↑B", "us-ascii", ENCODING_NONE, "HTML", "A↑B"); 975 TESTCASE("A&UParrow;B", "us-ascii", ENCODING_NONE, "HTML", "A"UTF8_REPLACEMENT"B"); 976 TESTCASE("A&upARROW;B", "us-ascii", ENCODING_NONE, "HTML", "A"UTF8_REPLACEMENT"B"); 977 978 /* &nonesuch; is most definitely not defined */ 979 TESTCASE("A&nonesuch;B", "us-ascii", ENCODING_NONE, "HTML", "A"UTF8_REPLACEMENT"B"); 980 981 /* HTML: Strip HTML from snippet and don't canonify */ 982 flags = CHARSET_SNIPPET; 983 TESTCASE("<b>Photo</b> <em>booth</em>", "us-ascii", ENCODING_NONE, "HTML", "Photo booth"); 984 985 /* HTML: Strip HTML from snippet, there is nothing to escape */ 986 flags = CHARSET_SNIPPET|CHARSET_ESCAPEHTML; 987 TESTCASE("<b>Photo</b>", "us-ascii", ENCODING_NONE, "HTML", "Photo"); 988 989 /* PLAIN: Generate snippet, don't escape HTML by default */ 990 flags = CHARSET_SNIPPET; 991 TESTCASE("<b>Photo</b>", "us-ascii", ENCODING_NONE, "PLAIN", "<b>Photo</b>"); 992 993 /* 
PLAIN: Generate snippet with escaped HTML */ 994 flags = CHARSET_SNIPPET|CHARSET_ESCAPEHTML; 995 TESTCASE("<b>Photo</b>", "us-ascii", ENCODING_NONE, "PLAIN", "<b>Photo</b>"); 996} 997#undef TESTCASE 998 999static void test_utf8_to_searchform(void) 1000{ 1001 char *s; 1002 1003 /* LATIN SMALL LETTER I WITH DIAERESIS' (U+00EF) */ 1004 static const char UTF8_1[] = "\xC3\xAF"; 1005 static const char SEARCH_1[] = "I"; 1006 1007 /* LATIN CAPITAL LETTER I WITH DIAERESIS' (U+00CF) */ 1008 static const char UTF8_2[] = "\xC3\x8F"; 1009 static const char SEARCH_2[] = "I"; 1010 1011 /* LATIN CAPITAL LETTER I (U+0049) COMBINING DIAERESIS (U+0308) */ 1012 static const char UTF8_3[] = "\x49\xCC\x88"; 1013 static const char SEARCH_3[] = "I"; 1014 1015 /* LATIN SMALL LETTER I' (U+0069) COMBINING DIAERESIS (U+0408) */ 1016 static const char UTF8_4[] = "\x69\xCC\x88"; 1017 static const char SEARCH_4[] = "I"; 1018 1019 int flags = CHARSET_SKIPDIACRIT | CHARSET_MERGESPACE; /* default */ 1020 1021 s = charset_utf8_to_searchform(UTF8_1, flags); 1022 CU_ASSERT_PTR_NOT_NULL(s); 1023 CU_ASSERT_STRING_EQUAL(s, SEARCH_1); 1024 free(s); 1025 1026 s = charset_utf8_to_searchform(UTF8_2, flags); 1027 CU_ASSERT_PTR_NOT_NULL(s); 1028 CU_ASSERT_STRING_EQUAL(s, SEARCH_2); 1029 free(s); 1030 1031 s = charset_utf8_to_searchform(UTF8_3, flags); 1032 CU_ASSERT_PTR_NOT_NULL(s); 1033 CU_ASSERT_STRING_EQUAL(s, SEARCH_3); 1034 free(s); 1035 1036 s = charset_utf8_to_searchform(UTF8_4, flags); 1037 CU_ASSERT_PTR_NOT_NULL(s); 1038 CU_ASSERT_STRING_EQUAL(s, SEARCH_4); 1039 free(s); 1040} 1041 1042static void test_charset_decode(void) 1043{ 1044 1045#define TESTCASE(in, inlen, want, wantlen, enc) \ 1046 { \ 1047 struct buf _dst = BUF_INITIALIZER; \ 1048 static const char _in[] = (in); \ 1049 static const char _want[] = (want); \ 1050 int _enc = (enc); \ 1051 size_t _inlen = (inlen); \ 1052 size_t _wantlen = (wantlen); \ 1053 int _r; \ 1054 _r = charset_decode(&_dst, _in, _inlen, _enc); \ 1055 CU_ASSERT(_r == 0); 
\ 1056 CU_ASSERT(_dst.len == _wantlen); \ 1057 { \ 1058 const char *p; \ 1059 for(p = _dst.s; p < _dst.s + _dst.len; p++) { \ 1060 if (0) fprintf(stderr, "%x\n", *p); \ 1061 } \ 1062 } \ 1063 CU_ASSERT(memcmp(_dst.s, _want, _wantlen) == 0); \ 1064 } 1065 1066 TESTCASE("", 0, "", 0, ENCODING_NONE); 1067 TESTCASE("", 0, "", 0, ENCODING_BASE64); 1068 TESTCASE("Hello", 5, "Hello", 5, ENCODING_NONE); 1069 TESTCASE("beefc0de", 8, "\x6d\xe7\x9f\x73\x47\x5e", 6, ENCODING_BASE64); 1070 1071#undef TESTCASE 1072} 1073 1074/* vim: set ft=c: */ 1075