1 // Copyright (c) 2010, Thomas Goyne <plorkyeran@aegisub.org>
2 //
3 // Permission to use, copy, modify, and distribute this software for any
4 // purpose with or without fee is hereby granted, provided that the above
5 // copyright notice and this permission notice appear in all copies.
6 //
7 // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
8 // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
9 // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
10 // ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
11 // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
12 // ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
13 // OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
14 
15 #include <libaegisub/charset_conv.h>
16 
17 #include <main.h>
18 
19 #include <cstdint>
20 #include <iconv.h>
21 
22 using namespace agi::charset;
23 
// Constructing a converter between UTF-8 and UTF-16LE must not throw.
TEST(lagi_iconv, BasicSetup) {
	const char *const source_charset = "UTF-8";
	const char *const dest_charset = "UTF-16LE";
	EXPECT_NO_THROW(IconvWrapper(source_charset, dest_charset));
}
27 
// An unknown charset name on either side (or both) of the conversion must be
// reported as UnsupportedConversion at construction time.
TEST(lagi_iconv, InvalidConversions) {
	const char *const bogus = "nonexistent charset";
	const char *const valid = "UTF-16LE";

	EXPECT_THROW(IconvWrapper(bogus, valid), UnsupportedConversion);
	EXPECT_THROW(IconvWrapper(valid, bogus), UnsupportedConversion);
	EXPECT_THROW(IconvWrapper(bogus, bogus), UnsupportedConversion);
}
33 
// SrcStrLen/DstStrLen on a UTF-8 <-> UTF-8 converter: a string of `len`
// spaces is expected to measure exactly `len`, for every len in [0, 10).
TEST(lagi_iconv, StrLen1) {
	IconvWrapper conv("UTF-8", "UTF-8", false);
	for (int len = 0; len != 10; ++len) {
		const std::string spaces(len, ' ');
		const char *const raw = spaces.c_str();
		ASSERT_EQ(len, conv.SrcStrLen(raw));
		ASSERT_EQ(len, conv.DstStrLen(raw));
	}
}
// SrcStrLen/DstStrLen on a UTF-16LE <-> UTF-16LE converter: a string of `i`
// 16-bit spaces is expected to measure 2*i, for every i in [0, 10).
TEST(lagi_iconv, StrLen2) {
	IconvWrapper conv("UTF-16LE", "UTF-16LE", false);
	for (int i = 0; i < 10; i++) {
		// std::u16string instead of std::basic_string<int16_t>: the standard
		// only provides std::char_traits for the character types, so a string
		// of int16_t does not build on every standard library.
		const std::u16string str(i, u' ');
		const char *const bytes = reinterpret_cast<const char *>(str.c_str());
		// Two bytes per code unit; the expected value excludes the terminator
		ASSERT_EQ(2*i, conv.SrcStrLen(bytes));
		ASSERT_EQ(2*i, conv.DstStrLen(bytes));
	}
}
// SrcStrLen/DstStrLen on a UTF-32LE <-> UTF-32LE converter: a string of `i`
// 32-bit spaces is expected to measure 4*i, for every i in [0, 10).
TEST(lagi_iconv, StrLen4) {
	IconvWrapper conv("UTF-32LE", "UTF-32LE", false);
	for (int i = 0; i < 10; i++) {
		// std::u32string instead of std::basic_string<int32_t>: the standard
		// only provides std::char_traits for the character types, so a string
		// of int32_t does not build on every standard library.
		const std::u32string str(i, U' ');
		const char *const bytes = reinterpret_cast<const char *>(str.c_str());
		// Four bytes per code unit; the expected value excludes the terminator
		ASSERT_EQ(4*i, conv.SrcStrLen(bytes));
		ASSERT_EQ(4*i, conv.DstStrLen(bytes));
	}
}
58 
59 #ifdef _LIBICONV_VERSION
// Exercises the third constructor argument: with it set to false an
// unrepresentable character is expected to raise BadOutput, with it set to
// true the conversion is expected to succeed with a replacement. A
// UTF-8 -> UTF-16LE converter serves as the control that never needs one.
TEST(lagi_iconv, Fallbacks) {
	IconvWrapper strict_sjis("UTF-8", "Shift-JIS", false);
	IconvWrapper lenient_sjis("UTF-8", "Shift-JIS", true);
	IconvWrapper to_utf16("UTF-8", "UTF-16LE", false);

	// Shift-JIS does not have a backslash
	const char backslash[] = "\\";
	EXPECT_THROW(strict_sjis.Convert(backslash), BadOutput);
	ASSERT_NO_THROW(lenient_sjis.Convert(backslash));
	EXPECT_EQ("\\", lenient_sjis.Convert(backslash));
	EXPECT_NO_THROW(to_utf16.Convert(backslash));

	// BOM into non-unicode: expected to be dropped entirely when falling back
	const char utf8_bom[] = "\xEF\xBB\xBF";
	EXPECT_THROW(strict_sjis.Convert(utf8_bom), BadOutput);
	ASSERT_NO_THROW(lenient_sjis.Convert(utf8_bom));
	EXPECT_EQ("", lenient_sjis.Convert(utf8_bom));
	EXPECT_NO_THROW(to_utf16.Convert(utf8_bom));

	// A snowman (U+2603): expected to become '?' when falling back
	const char utf8_snowman[] = "\xE2\x98\x83";
	EXPECT_THROW(strict_sjis.Convert(utf8_snowman), BadOutput);
	EXPECT_NO_THROW(to_utf16.Convert(utf8_snowman));
	ASSERT_NO_THROW(lenient_sjis.Convert(utf8_snowman));
	EXPECT_EQ("?", lenient_sjis.Convert(utf8_snowman));
}
85 
// Malformed source data is expected to be reported as BadInput.
TEST(lagi_iconv, BadInput) {
	// A single byte is only half of a UTF-16 code unit
	const char half_code_unit[] = " ";
	IconvWrapper from_utf16("UTF-16LE", "UTF-8");
	EXPECT_THROW(from_utf16.Convert(half_code_unit), BadInput);

	// 0xE2 opens a three-byte sequence, but 0xFF is not a continuation byte
	const char broken_sequence[] = "\xE2\xFF";
	IconvWrapper from_utf8("UTF-8", "UTF-16LE");
	EXPECT_THROW(from_utf8.Convert(broken_sequence), BadInput);
}
92 #endif
93 
// Converts a single space (U+0020) between UTF-8 and both byte orders of
// UTF-16 and compares against hand-written byte sequences.
TEST(lagi_iconv, Conversions) {
	IconvWrapper from_utf16le("UTF-16LE", "UTF-8", false);
	IconvWrapper from_utf16be("UTF-16BE", "UTF-8", false);
	IconvWrapper to_utf16le("UTF-8", "UTF-16LE", false);

	// The UTF-16 forms contain a zero byte, so the strings are built with an
	// explicit length instead of relying on NUL termination.
	const char be_bytes[] = {0, 32};
	const char le_bytes[] = {32, 0};
	const std::string utf8_space(" ");
	const std::string utf16be_space(be_bytes, sizeof be_bytes);
	const std::string utf16le_space(le_bytes, sizeof le_bytes);

	EXPECT_EQ(utf8_space, from_utf16le.Convert(utf16le_space));
	EXPECT_EQ(utf8_space, from_utf16be.Convert(utf16be_space));
	EXPECT_EQ(utf16le_space, to_utf16le.Convert(utf8_space));
}
110 
111 // Basic overflow tests
// Converts one NUL byte of UTF-8 into caller-supplied buffers of 0, 1 and 2
// bytes. The too-small buffers are expected to raise BufferTooSmall without
// writing anything; the 2-byte buffer is expected to receive exactly two
// zero bytes and nothing past them.
TEST(lagi_iconv, Buffer) {
	IconvWrapper conv("UTF-8", "UTF-16LE", false);

	// Pre-fill with a sentinel so any write is detectable. A plain loop is
	// used instead of memset, which this file called without including
	// <cstring>.
	const char untouched = '\xFF';
	char buff[32];
	for (char &byte : buff)
		byte = untouched;

	EXPECT_THROW(conv.Convert("", 1, buff, 0), BufferTooSmall);
	EXPECT_EQ(untouched, buff[0]);
	EXPECT_THROW(conv.Convert("", 1, buff, 1), BufferTooSmall);
	EXPECT_EQ(untouched, buff[0]);
	EXPECT_NO_THROW(conv.Convert("", 1, buff, 2));
	EXPECT_EQ('\0', buff[0]);
	EXPECT_EQ('\0', buff[1]);
	EXPECT_EQ(untouched, buff[2]);
}
126 
// Converting to the charset named by the empty string must work and leave a
// plain space unchanged.
// NOTE(review): presumably "" selects the local charset, going by the test's
// name - confirm against IconvWrapper.
TEST(lagi_iconv, LocalSupport) {
	const char *const local_charset = "";
	ASSERT_NO_THROW(IconvWrapper("UTF-8", local_charset));

	IconvWrapper to_local("UTF-8", local_charset);
	const char space[] = " ";
	ASSERT_NO_THROW(to_local.Convert(space));
	EXPECT_EQ(" ", to_local.Convert(space));
}
// The charset name "wchar_t" must be accepted as a conversion target.
TEST(lagi_iconv, wchar_tSupport) {
	const char *const wide_charset = "wchar_t";
	EXPECT_NO_THROW(IconvWrapper("UTF-8", wide_charset));
}
136 
// For every encoding returned by GetEncodingsList, converters to and from
// UTF-8 must be constructible, and an ASCII pangram must survive the trip
// UTF-8 -> encoding -> UTF-8 unchanged.
TEST(lagi_iconv, Roundtrip) {
	const char pangram[] = "Jackdaws love my big sphinx of quartz";

	const auto encodings = GetEncodingsList<std::vector<std::string>>();
	for (auto const& encoding : encodings) {
		const char *const charset = encoding.c_str();
		ASSERT_NO_THROW(IconvWrapper("utf-8", charset));
		ASSERT_NO_THROW(IconvWrapper(charset, "utf-8"));

		IconvWrapper decoder(charset, "utf-8");
		IconvWrapper encoder("utf-8", charset);
		EXPECT_EQ(pangram, decoder.Convert(encoder.Convert(pangram)));
	}
}
148 
// Checks UTF-8 -> ISO-6937-2 conversion: the 7-bit range, a few characters
// with known encodings, and a code point with no mapping (which is expected
// to become '?' by default and to raise BadOutput when the third constructor
// argument is false).
TEST(lagi_iconv, Iso6937) {
	ASSERT_NO_THROW(IconvWrapper("UTF-8", "ISO-6937-2"));
	IconvWrapper with_subst("UTF-8", "ISO-6937-2");
	IconvWrapper without_subst("UTF-8", "ISO-6937-2", false);

	// 7-bit is same as ISO-8859
	for (int code = 0; code != 128; ++code) {
		const char ascii[] = { static_cast<char>(code), '\0' };
		std::string converted;
		EXPECT_NO_THROW(converted = with_subst.Convert(ascii));
		EXPECT_STREQ(ascii, converted.c_str());
	}

	// Reused across the checks below, so a throwing conversion leaves the
	// previous result in place.
	std::string out;

	// LATIN CAPITAL LETTER D WITH CARON (U+010E) - multibyte char in main block
	const char d_caron[] = "\xC4\x8E";
	EXPECT_NO_THROW(out = with_subst.Convert(d_caron));
	EXPECT_STREQ("\xCF\x44", out.c_str());

	// BREVE - multibyte char in extended ranges
	const char breve[] = "\xCB\x98";
	EXPECT_NO_THROW(out = with_subst.Convert(breve));
	EXPECT_STREQ("\xC6\x20", out.c_str());

	// EM DASH - single byte char in extended ranges
	const char em_dash[] = "\xE2\x80\x94";
	EXPECT_NO_THROW(out = with_subst.Convert(em_dash));
	EXPECT_STREQ("\xD0", out.c_str());

	// codepoint not in ISO-6937-2
	const char unmapped[] = "\xCB\x97";
	EXPECT_NO_THROW(out = with_subst.Convert(unmapped));
	EXPECT_STREQ("?", out.c_str());
	EXPECT_THROW(without_subst.Convert(unmapped), agi::charset::BadOutput);
}
181