tthsum/tthsum/utf8_test.c

/* vim: set ts=8 sts=4 sw=4 tw=80 noet: */
/*======================================================================
Copyright (C) 2004,2005,2009 Walter Doekes <walter+tthsum@wjd.nu>
This file is part of tthsum.

tthsum is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

tthsum is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with tthsum.  If not, see <http://www.gnu.org/licenses/>.
======================================================================*/
#include "utf8.h"

#include "test.h"
#include <stdlib.h>
#include <string.h>
#include <wchar.h>


/*
 * Convert `utf8` to wide characters and `unicode` to UTF-8 with all four
 * conversion functions and check both the returned lengths and the produced
 * contents (terminator included) against the expected counterpart.
 *
 * name     label used in the failure messages
 * utf8     NUL-terminated UTF-8 sample, must be shorter than 2048 bytes
 * unicode  NUL-terminated wide sample, must be shorter than 512 characters
 *
 * Returns 0 when every check passes. Failed checks leave the function
 * through the FAIL and TEST_PASS macros of test.h (presumably reporting the
 * message and returning non-zero; confirm in test.h).
 */
static int symcmpbin(const char* name, const char* utf8,
	const wchar_t* unicode) {
    char utf8buf[2048];
    char* utf8alloc;
    wchar_t unicodebuf[2048];
    wchar_t* unicodealloc;
    unsigned utf8len = (unsigned)strlen(utf8);
    unsigned unicodelen = (unsigned)wcslen(unicode);
    unsigned ret;
    unsigned allocsuccess;
    /* Poison both buffers in full, so that bytes the converters fail to
     * write (a missing terminator for instance) cannot match by accident. */
    memset(utf8buf, 85, sizeof(utf8buf));
    memset(unicodebuf, 85, sizeof(unicodebuf));

    if (utf8len >= 2048 || unicodelen >= 512)
	FAIL3("Test values too large: \"%s\" is %u and %u bytes", name,
		utf8len, unicodelen);
    /* Conversions into the caller supplied buffers. */
    if ((ret = (unsigned)utf8towcs(unicodebuf, utf8, 2048)) != unicodelen)
	FAIL3("utf8towcs returned wrong length for \"%s\": %u instead of %u",
		name, ret, unicodelen);
    if ((ret = (unsigned)wcstoutf8(utf8buf, unicode, 2048)) != utf8len)
	FAIL3("wcstoutf8 returned wrong length for \"%s\": %u instead of %u",
		name, ret, utf8len);
    /* Conversions that hand back a buffer which is freed below. */
    if ((ret = (unsigned)utf8toawcs(&unicodealloc, utf8)) != unicodelen)
	FAIL3("utf8toawcs returned wrong length for \"%s\": %u instead of %u",
		name, ret, unicodelen);
    if ((ret = (unsigned)wcstoautf8(&utf8alloc, unicode)) != utf8len) {
	free(unicodealloc);
	FAIL3("wcstoautf8 returned wrong length for \"%s\": %u instead of %u",
		name, ret, utf8len);
    }

    /* Remember the verdict on the allocated results first, so both can be
     * released before any of the checks below leaves the function. */
    allocsuccess = memcmp(utf8, utf8alloc, utf8len + 1) == 0 && memcmp(unicode,
	    unicodealloc, (unicodelen + 1) * sizeof(wchar_t)) == 0;
    free(utf8alloc);
    free(unicodealloc);

    /* Go through unsigned char: a plain char may be signed and bytes from
     * 0x80 upwards would otherwise be printed sign-extended (0xffe3). */
    TEST_PASS3(memcmp(utf8, utf8buf, utf8len + 1) == 0,
	    "utf8 strings mismatch on \"%s\", first chars 0x%02hx 0x%02hx",
	    name, (unsigned short)(unsigned char)utf8buf[0],
	    (unsigned short)(unsigned char)utf8buf[1]); /* %hhx is not C90 */
    TEST_PASS2(memcmp(unicode, unicodebuf, (unicodelen + 1) * sizeof(wchar_t))
	    == 0, "unicode strings mismatch on \"%s\", first char 0x%x",
	    name, (unsigned)unicodebuf[0]);
    TEST_PASS(allocsuccess, "utf8 and/or unicode strings allocated by the "
	    "*toa* variants failed");
    return 0;
}

/*
 * Shorthand for symcmpbin() for samples that are printable as they are: the
 * UTF-8 text itself serves as the label in failure messages.
 */
static int symcmp(const char* utf8, const wchar_t* unicode) {
    const char* const label = utf8;
    return symcmpbin(label, utf8, unicode);
}

/*
 * Convert `utf8` into a destination of only five wide characters and check
 * that utf8towcs reports five characters and that those five are equal to
 * the start of the complete conversion obtained from utf8toawcs.
 *
 * utf8  NUL-terminated UTF-8 sample; the callers pass samples that decode to
 *       more than five characters
 *
 * Returns 0 on success. Failed checks leave through the FAIL macros of
 * test.h (presumably reporting and returning non-zero; confirm in test.h).
 */
static int shortutf8towcs(const char* utf8) {
    size_t ret;
    wchar_t buf[5];
    wchar_t* abuf;
    int differs;
    ret = utf8towcs(buf, utf8, 5);
    if (ret == (size_t)-1)
	FAIL1("utf8towcs returned -1 on \"%s\" with a 5 char buffer", utf8);
    if (ret != 5)
	FAIL2("utf8towcs returned %u on \"%s\" with a 5 char buffer",
		(unsigned)ret, utf8);
    if (utf8toawcs(&abuf, utf8) == (size_t)-1)
	FAIL1("failed to get an auto-alloc'd comparison sample for \"%s\"",
		utf8);
    /* memcmp counts bytes: compare the whole five character buffer, not
     * merely its first five bytes. */
    differs = memcmp(abuf, buf, sizeof(buf)) != 0;
    free(abuf);
    if (differs)
	FAIL1("short utf8towcs didn't compare with full sample for \"%s\"",
		utf8);
    return 0;
}

/*
 * Check the width of wchar_t that the other tests rely on: two bytes when
 * built for Windows, four bytes everywhere else.
 */
static int test_wchar_size() {
    const unsigned width = (unsigned)sizeof(wchar_t);
#ifdef _WIN32
    TEST_PASS1(width == 2,
	    "expected wchar_t to be 2 bytes on windows, got %u instead",
	    width);
#else /* !_WIN32 */
    TEST_PASS1(width == 4,
	    "expected wchar_t to be 4 bytes on this OS, got %u instead",
	    width);
#endif /* !_WIN32 */
    return 0;
}

/*
 * Round-trip plain ASCII in both directions: the empty string, a run of
 * alphanumerics and the characters at both ends of the 7-bit range.
 * Returns the sum of the individual results, 0 meaning all passed.
 */
static int test_bidirectional_ascii() {
    int failures = 0;

    failures += symcmp("", L"");
    failures += symcmp("0123ABCDabcd", L"0123ABCDabcd");
    failures += symcmpbin("(low ascii)", "\x01\x02\x03\x7d\x7e\x7f",
	    L"\x01\x02\x03\x7d\x7e\x7f");
    return failures;
}

/*
 * Round-trip three Hiragana characters (U+306A U+308B U+3068), each of
 * which takes three bytes in UTF-8.
 */
static int test_bidirectional_asian() {
    const char* const encoded = "\xe3\x81\xaa" "\xe3\x82\x8b" "\xe3\x81\xa8";
    wchar_t codepoints[] = {0x306a, 0x308b, 0x3068, 0x0};

    return symcmpbin("Hiragana na-ru-to", encoded, codepoints);
}

/*
 * Round-trip accented Latin characters, all of which take two bytes in
 * UTF-8. Returns the sum of the individual results, 0 meaning all passed.
 */
static int test_bidirectional_european() {
    wchar_t trema[] = {0xe4, 0xeb, 0xef, 0xf6, 0xfc, 0xff,
		       0xc4, 0xcb, 0xcf, 0xd6, 0xdc, 0x178, 0x0};
    wchar_t accented_a[] = {0xe1, 0xe2, 0xe0, 0xc1, 0xc2, 0xc0, 0x0};
    wchar_t nordic[] = {0xe5, 0xe4, 0xf6, 0xc5, 0xc4, 0xd6, 0x0};
    int failures = 0;

    failures += symcmpbin("Dutch aeiouy with trema",
	    "\xc3\xa4\xc3\xab\xc3\xaf\xc3\xb6\xc3\xbc\xc3\xbf"
	    "\xc3\x84\xc3\x8b\xc3\x8f\xc3\x96\xc3\x9c\xc5\xb8", trema);
    failures += symcmpbin("French several a's",
	    "\xc3\xa1\xc3\xa2\xc3\xa0\xc3\x81\xc3\x82\xc3\x80", accented_a);
    failures += symcmpbin("Swedish ao/ae/oe",
	    "\xc3\xa5\xc3\xa4\xc3\xb6\xc3\x85\xc3\x84\xc3\x96", nordic);
    return failures;
}

/*
 * Feed malformed input to the converters and require each call to return
 * (size_t)-1: UTF-8 with stray or truncated sequences for utf8towcs, wide
 * strings with surrogate values or (non-Windows only) a too large value for
 * wcstoutf8.
 *
 * Returns 0 when every call failed as required. A call that succeeds leaves
 * through TEST_PASS of test.h (presumably reporting and returning non-zero;
 * confirm in test.h).
 */
static int test_invalid_character_handling() {
    wchar_t unicode[2048];
    char utf8[2048];
    wchar_t illegal_surrogates1[] = {0x1,0xdc00,0xd800,0x1,0x0};
    wchar_t illegal_surrogates2[] = {0x1,0xd800,0xd800,0x1,0x0};
    wchar_t illegal_surrogates3[] = {0x1,0xd800,0x1,0xdc00,0x0};
#ifndef _WIN32
    wchar_t high_unicode[] = {0x1,0xffff,0x200000,0xffff,0x1,0x0};
#endif /* !_WIN32 */

    TEST_PASS(utf8towcs(unicode, "abc" "\xe4" "def", 2048) == (size_t)-1,
	    "utf8towcs should fail on latin1 swedish ae (0xE4)");
    TEST_PASS(utf8towcs(unicode, "\xc1\xff", 2048) == (size_t)-1,
	    "utf8towcs should fail on \"\\xc1\\xff\"");
    TEST_PASS(utf8towcs(unicode, "\xc2" "abc", 2048) == (size_t)-1,
	    "utf8towcs should fail on \"\\xc2abc\"");
    /* 0x80 and 0xbf are the lowest and highest continuation byte; neither
     * may appear without a lead byte in front of it. */
    TEST_PASS(utf8towcs(unicode, "abc" "\x80" "def", 2048) == (size_t)-1,
	    "utf8towcs should fail on bare \\x80");
    TEST_PASS(utf8towcs(unicode, "abc" "\xbf" "def", 2048) == (size_t)-1,
	    "utf8towcs should fail on bare \\xbf");
    TEST_PASS(wcstoutf8(utf8, illegal_surrogates1, 2048) == (size_t)-1,
	    "wcstoutf8 should fail on surrogate characters");
    TEST_PASS(wcstoutf8(utf8, illegal_surrogates2, 2048) == (size_t)-1,
	    "wcstoutf8 should fail on surrogate characters");
    TEST_PASS(wcstoutf8(utf8, illegal_surrogates3, 2048) == (size_t)-1,
	    "wcstoutf8 should fail on surrogate characters");
#ifndef _WIN32
    /* Windows has 16bits wchars, so nothing above 0xffff can be found */
    TEST_PASS(wcstoutf8(utf8, high_unicode, 2048) == (size_t)-1,
	    "wcstoutf8 should fail on characters above 0x200000"); /*5+ bytes*/
#endif /* !_WIN32 */
    return 0;
}

/*
 * Overlong encodings (more bytes than the value needs) must be rejected.
 * Tries two, three and four byte forms of 0x0, 0x2f and 0x7f in turn and
 * returns 1 at the first one that utf8towcs accepts, 0 if all are refused.
 */
static int test_overlong_encoding() {
    static const char* const overlong[] = {
	"\xc0\x80",		/* 0x0 */
	"\xc0\xaf",		/* 0x2f */
	"\xc1\xbf",		/* 0x7f */
	"\xe0\x80\x80",		/* 0x0 */
	"\xe0\x80\xaf",		/* 0x2f */
	"\xe0\x81\xbf",		/* 0x7f */
	"\xf0\x80\x80\x80",	/* 0x0 */
	"\xf0\x80\x80\xaf",	/* 0x2f */
	"\xf0\x80\x81\xbf"	/* 0x7f */
    };
    wchar_t sink[2];
    size_t i;

    for (i = 0; i < sizeof(overlong) / sizeof(overlong[0]); ++i) {
	if (utf8towcs(sink, overlong[i], 2) != (size_t)-1)
	    return 1;
    }
    return 0;
}

/*
 * Run shortutf8towcs() on samples that are longer than its destination
 * buffer: short ASCII, two-byte sequences and long ASCII.
 * Returns the sum of the individual results, 0 meaning all passed.
 */
static int test_short_destination() {
    static const char* const samples[] = {
	"abcdef",
	"\xc3\xa1\xc3\xa2\xc3\xa3\xc3\xa4\xc3\xa5\xc3\xa6",
	"ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    };
    int failures = 0;
    size_t i;

    for (i = 0; i < sizeof(samples) / sizeof(samples[0]); ++i)
	failures += shortutf8towcs(samples[i]);
    return failures;
}

/*
 * Round-trip four Osmanya characters (U+10480..U+10483), which do not fit
 * in 16 bits: the Windows build expects surrogate pairs, other builds one
 * wchar_t per character.
 */
static int test_wide_wchars() { /* larger than 16 bits */
    const char* const encoded =
	"\xf0\x90\x92\x80" "\xf0\x90\x92\x81"
	"\xf0\x90\x92\x82" "\xf0\x90\x92\x83";
#ifdef _WIN32
    wchar_t expected[] = {0xd801, 0xdc80, 0xd801, 0xdc81,
			  0xd801, 0xdc82, 0xd801, 0xdc83, 0x0};
#else /* !_WIN32 */
    wchar_t expected[] = {0x10480, 0x10481, 0x10482, 0x10483, 0x0};
#endif /* !_WIN32 */

    return symcmpbin("Osmanya alef ba ta ja", encoded, expected);
}


/*
 * Test list of this file: names every test function defined above, under
 * the suite name utf8_test. NOTE(review): TESTS, TEST and ENDTESTS come from
 * test.h; presumably they expand to the function that runs the listed tests
 * in this order -- confirm in test.h.
 */
TESTS(utf8_test)
    TEST(test_wchar_size);
    TEST(test_bidirectional_ascii);
    TEST(test_bidirectional_asian);
    TEST(test_bidirectional_european);
    TEST(test_invalid_character_handling);
    TEST(test_overlong_encoding);
    TEST(test_short_destination);
    TEST(test_wide_wchars);
ENDTESTS