1 #include <QtCore/QScopedPointer>
2 #include <QtTest/QtTest>
3
4 #include <poppler-private.h>
5
6 #include <cstring>
7
8 #include "GlobalParams.h"
9 #include "UnicodeTypeTable.h"
10 #include "UTF.h"
11
12 class TestUTFConversion : public QObject
13 {
14 Q_OBJECT
15 public:
TestUTFConversion(QObject * parent=nullptr)16 explicit TestUTFConversion(QObject *parent = nullptr) : QObject(parent) { }
17 private slots:
18 void testUTF_data();
19 void testUTF();
20 void testUnicodeToAscii7();
21 void testUnicodeLittleEndian();
22 };
23
/// Returns true when the two NUL-terminated byte strings have identical content.
static bool compare(const char *a, const char *b)
{
    const int ordering = strcmp(a, b);
    return !ordering;
}
28
/// Returns true when the two zero-terminated UTF-16 code unit sequences are
/// identical, including their length.
static bool compare(const uint16_t *a, const uint16_t *b)
{
    // Advance both cursors together until either sequence hits its terminator.
    for (; *a != 0 && *b != 0; ++a, ++b) {
        if (*a != *b) {
            return false;
        }
    }

    // Equal only if both sequences ended at the same position.
    return *a == *b;
}
37
compare(const Unicode * a,const char * b,int len)38 static bool compare(const Unicode *a, const char *b, int len)
39 {
40 for (int i = 0; i < len; i++) {
41 if (a[i] != (Unicode)b[i])
42 return false;
43 }
44
45 return true;
46 }
47
/// Returns true when the first len entries of a equal the first len UTF-16
/// code units of b, compared element by element. A non-positive len yields true.
static bool compare(const Unicode *a, const uint16_t *b, int len)
{
    int pos = 0;

    // Skip over the matching prefix.
    while (pos < len && a[pos] == b[pos]) {
        ++pos;
    }

    // The whole range matched only if the scan was not stopped by a mismatch.
    return pos >= len;
}
testUTF_data()57 void TestUTFConversion::testUTF_data()
58 {
59 QTest::addColumn<QString>("s");
60
61 QTest::newRow("<empty>") << QString(QLatin1String(""));
62 QTest::newRow("a") << QStringLiteral("a");
63 QTest::newRow("abc") << QStringLiteral("abc");
64 QTest::newRow("Latin") << QStringLiteral("Vitrum edere possum; mihi non nocet");
65 QTest::newRow("Greek") << QStringLiteral("Μπορώ να φάω σπασμένα γυαλιά χωρίς να πάθω τίποτα");
66 QTest::newRow("Icelandic") << QStringLiteral("Ég get etið gler án þess að meiða mig");
67 QTest::newRow("Russian") << QStringLiteral("Я могу есть стекло, оно мне не вредит.");
68 QTest::newRow("Sanskrit") << QStringLiteral("काचं शक्नोम्यत्तुम् । नोपहिनस्ति माम् ॥");
69 QTest::newRow("Arabic") << QStringLiteral("أنا قادر على أكل الزجاج و هذا لا يؤلمني");
70 QTest::newRow("Chinese") << QStringLiteral("我能吞下玻璃而不伤身体。");
71 QTest::newRow("Thai") << QStringLiteral("ฉันกินกระจกได้ แต่มันไม่ทำให้ฉันเจ็บ");
72 QTest::newRow("non BMP") << QStringLiteral("");
73 }
74
testUTF()75 void TestUTFConversion::testUTF()
76 {
77 char utf8Buf[1000];
78 char *utf8String;
79 uint16_t utf16Buf[1000];
80 uint16_t *utf16String;
81 int len;
82
83 QFETCH(QString, s);
84 char *str = strdup(s.toUtf8().constData());
85
86 // UTF-8 to UTF-16
87
88 len = utf8CountUtf16CodeUnits(str);
89 QCOMPARE(len, s.size()); // QString size() returns number of code units, not code points
90 Q_ASSERT(len < (int)sizeof(utf16Buf)); // if this fails, make utf16Buf larger
91
92 len = utf8ToUtf16(str, utf16Buf);
93 QVERIFY(compare(utf16Buf, s.utf16()));
94 QCOMPARE(len, s.size());
95
96 utf16String = utf8ToUtf16(str);
97 QVERIFY(compare(utf16String, s.utf16()));
98 free(utf16String);
99
100 std::string sUtf8(str);
101 std::unique_ptr<GooString> gsUtf16_a(utf8ToUtf16WithBom(sUtf8));
102 std::unique_ptr<GooString> gsUtf16_b(Poppler::QStringToUnicodeGooString(s));
103 QCOMPARE(gsUtf16_a->cmp(gsUtf16_b.get()), 0);
104
105 // UTF-16 to UTF-8
106
107 len = utf16CountUtf8Bytes(s.utf16());
108 QCOMPARE(len, (int)strlen(str));
109 Q_ASSERT(len < (int)sizeof(utf8Buf)); // if this fails, make utf8Buf larger
110
111 len = utf16ToUtf8(s.utf16(), utf8Buf);
112 QVERIFY(compare(utf8Buf, str));
113 QCOMPARE(len, (int)strlen(str));
114
115 utf8String = utf16ToUtf8(s.utf16());
116 QVERIFY(compare(utf8String, str));
117 free(utf8String);
118
119 free(str);
120 }
121
testUnicodeToAscii7()122 void TestUTFConversion::testUnicodeToAscii7()
123 {
124 globalParams = std::make_unique<GlobalParams>();
125
126 // Test string is one 'Registered' and twenty 'Copyright' chars
127 // so it's long enough to reproduce the bug given that glibc
128 // malloc() always returns 8-byte aligned memory addresses.
129 GooString *goo = Poppler::QStringToUnicodeGooString(QString::fromUtf8("®©©©©©©©©©©©©©©©©©©©©")); // clazy:exclude=qstring-allocations
130
131 Unicode *in;
132 const int in_len = TextStringToUCS4(goo->toStr(), &in);
133
134 delete goo;
135
136 int in_norm_len;
137 int *in_norm_idx;
138 Unicode *in_norm = unicodeNormalizeNFKC(in, in_len, &in_norm_len, &in_norm_idx, true);
139
140 free(in);
141
142 Unicode *out;
143 int out_len;
144 int *out_ascii_idx;
145
146 unicodeToAscii7(in_norm, in_norm_len, &out, &out_len, in_norm_idx, &out_ascii_idx);
147
148 free(in_norm);
149 free(in_norm_idx);
150
151 // ascii7 conversion: ® -> (R) © -> (c)
152 const char *expected_ascii = (char *)"(R)(c)(c)(c)(c)(c)(c)(c)(c)(c)(c)(c)(c)(c)(c)(c)(c)(c)(c)(c)(c)";
153
154 QCOMPARE(out_len, (int)strlen(expected_ascii));
155 QVERIFY(compare(out, expected_ascii, out_len));
156
157 free(out);
158 free(out_ascii_idx);
159 }
160
testUnicodeLittleEndian()161 void TestUTFConversion::testUnicodeLittleEndian()
162 {
163 uint16_t UTF16LE_hi[5] { 0xFFFE, 0x4800, 0x4900, 0x2100, 0x1126 }; // UTF16-LE "HI!☑"
164 std::string GooUTF16LE(reinterpret_cast<const char *>(UTF16LE_hi), sizeof(UTF16LE_hi));
165
166 uint16_t UTF16BE_hi[5] { 0xFEFF, 0x0048, 0x0049, 0x0021, 0x2611 }; // UTF16-BE "HI!☑"
167 std::string GooUTF16BE(reinterpret_cast<const char *>(UTF16BE_hi), sizeof(UTF16BE_hi));
168
169 // Let's assert both GooString's are different
170 QVERIFY(GooUTF16LE != GooUTF16BE);
171
172 Unicode *UCS4fromLE, *UCS4fromBE;
173 const int len1 = TextStringToUCS4(GooUTF16LE, &UCS4fromLE);
174 const int len2 = TextStringToUCS4(GooUTF16BE, &UCS4fromBE);
175
176 // len is 4 because TextStringToUCS4() removes the two leading Byte Order Mark (BOM) code points
177 QCOMPARE(len1, len2);
178 QCOMPARE(len1, 4);
179
180 // Check that now after conversion, UCS4fromLE and UCS4fromBE are now the same
181 for (int i = 0; i < len1; i++) {
182 QCOMPARE(UCS4fromLE[i], UCS4fromBE[i]);
183 }
184
185 const QString expected = QStringLiteral("HI!☑");
186
187 // Do some final verifications, checking the strings to be "HI!"
188 QVERIFY(*UCS4fromLE == *UCS4fromBE);
189 QVERIFY(compare(UCS4fromLE, expected.utf16(), len1));
190 QVERIFY(compare(UCS4fromBE, expected.utf16(), len1));
191 }
192
193 QTEST_GUILESS_MAIN(TestUTFConversion)
194 #include "check_utf_conversion.moc"
195