1 #include <QtCore/QScopedPointer>
2 #include <QtTest/QtTest>
3 
4 #include <poppler-private.h>
5 
6 #include <cstring>
7 #include <cstdint> // for uint16_t
8 
9 #include "GlobalParams.h"
10 #include "UnicodeTypeTable.h"
11 #include "UTF.h"
12 
13 class TestUTFConversion : public QObject
14 {
15     Q_OBJECT
16 public:
TestUTFConversion(QObject * parent=nullptr)17     explicit TestUTFConversion(QObject *parent = nullptr) : QObject(parent) { }
18 private slots:
19     void testUTF_data();
20     void testUTF();
21     void testUnicodeToAscii7();
22     void testUnicodeLittleEndian();
23 };
24 
/**
 * Tells whether two NUL-terminated byte strings have identical contents.
 *
 * @param a first string
 * @param b second string
 * @return true when strcmp() reports no difference, false otherwise
 */
static bool compare(const char *a, const char *b)
{
    const int order = strcmp(a, b);
    return order == 0;
}
29 
/**
 * Tells whether two zero-terminated sequences of 16-bit code units are equal.
 *
 * @param a first sequence, terminated by a 0 element
 * @param b second sequence, terminated by a 0 element
 * @return true when both sequences have the same length and the same elements
 */
static bool compare(const uint16_t *a, const uint16_t *b)
{
    // Advance in lock-step for as long as neither sequence has ended.
    for (; *a != 0 && *b != 0; ++a, ++b) {
        if (*a != *b) {
            return false;
        }
    }

    // At least one side hit its terminator; they match only if both did.
    return *a == *b;
}
38 
/**
 * Element-wise comparison of a Unicode array against a char array.
 *
 * Each char of @p b is converted to Unicode before being compared with the
 * element of @p a at the same index.
 *
 * @param a   Unicode values to check
 * @param b   expected characters
 * @param len number of elements to compare in both arrays
 * @return true when the first @p len elements match (also when len <= 0)
 */
static bool compare(const Unicode *a, const char *b, int len)
{
    int i = 0;
    // Stop at the first mismatch or once len elements have been checked.
    while (i < len && a[i] == static_cast<Unicode>(b[i])) {
        ++i;
    }

    return i >= len;
}
48 
/**
 * Element-wise comparison of a Unicode array against an array of 16-bit units.
 *
 * The comparison is purely positional: element i of @p a is compared with
 * element i of @p b, no decoding of @p b takes place.
 *
 * @param a   Unicode values to check
 * @param b   expected 16-bit values
 * @param len number of elements to compare in both arrays
 * @return true when the first @p len elements match (also when len <= 0)
 */
static bool compare(const Unicode *a, const uint16_t *b, int len)
{
    for (int remaining = len; remaining > 0; --remaining, ++a, ++b) {
        if (*a != *b) {
            return false;
        }
    }

    return true;
}
58 
testUTF_data()59 void TestUTFConversion::testUTF_data()
60 {
61     QTest::addColumn<QString>("s");
62 
63     QTest::newRow("<empty>") << QString(QLatin1String(""));
64     QTest::newRow("a") << QStringLiteral("a");
65     QTest::newRow("abc") << QStringLiteral("abc");
66     QTest::newRow("Latin") << QStringLiteral("Vitrum edere possum; mihi non nocet");
67     QTest::newRow("Greek") << QStringLiteral("Μπορώ να φάω σπασμένα γυαλιά χωρίς να πάθω τίποτα");
68     QTest::newRow("Icelandic") << QStringLiteral("Ég get etið gler án þess að meiða mig");
69     QTest::newRow("Russian") << QStringLiteral("Я могу есть стекло, оно мне не вредит.");
70     QTest::newRow("Sanskrit") << QStringLiteral("काचं शक्नोम्यत्तुम् । नोपहिनस्ति माम् ॥");
71     QTest::newRow("Arabic") << QStringLiteral("أنا قادر على أكل الزجاج و هذا لا يؤلمني");
72     QTest::newRow("Chinese") << QStringLiteral("我能吞下玻璃而不伤身体。");
73     QTest::newRow("Thai") << QStringLiteral("ฉันกินกระจกได้ แต่มันไม่ทำให้ฉันเจ็บ");
74     QTest::newRow("non BMP") << QStringLiteral("��������������");
75 }
76 
testUTF()77 void TestUTFConversion::testUTF()
78 {
79     char utf8Buf[1000];
80     char *utf8String;
81     uint16_t utf16Buf[1000];
82     uint16_t *utf16String;
83     int len;
84 
85     QFETCH(QString, s);
86     char *str = strdup(s.toUtf8().constData());
87 
88     // UTF-8 to UTF-16
89 
90     len = utf8CountUtf16CodeUnits(str);
91     QCOMPARE(len, s.size()); // QString size() returns number of code units, not code points
92     Q_ASSERT(len < (int)sizeof(utf16Buf)); // if this fails, make utf16Buf larger
93 
94     len = utf8ToUtf16(str, utf16Buf);
95     QVERIFY(compare(utf16Buf, s.utf16()));
96     QCOMPARE(len, s.size());
97 
98     utf16String = utf8ToUtf16(str);
99     QVERIFY(compare(utf16String, s.utf16()));
100     free(utf16String);
101 
102     std::string sUtf8(str);
103     std::unique_ptr<GooString> gsUtf16_a(utf8ToUtf16WithBom(sUtf8));
104     std::unique_ptr<GooString> gsUtf16_b(Poppler::QStringToUnicodeGooString(s));
105     QCOMPARE(gsUtf16_a->cmp(gsUtf16_b.get()), 0);
106 
107     // UTF-16 to UTF-8
108 
109     len = utf16CountUtf8Bytes(s.utf16());
110     QCOMPARE(len, (int)strlen(str));
111     Q_ASSERT(len < (int)sizeof(utf8Buf)); // if this fails, make utf8Buf larger
112 
113     len = utf16ToUtf8(s.utf16(), utf8Buf);
114     QVERIFY(compare(utf8Buf, str));
115     QCOMPARE(len, (int)strlen(str));
116 
117     utf8String = utf16ToUtf8(s.utf16());
118     QVERIFY(compare(utf8String, str));
119     free(utf8String);
120 
121     free(str);
122 }
123 
testUnicodeToAscii7()124 void TestUTFConversion::testUnicodeToAscii7()
125 {
126     globalParams = std::make_unique<GlobalParams>();
127 
128     // Test string is one 'Registered' and twenty 'Copyright' chars
129     // so it's long enough to reproduce the bug given that glibc
130     // malloc() always returns 8-byte aligned memory addresses.
131     GooString *goo = Poppler::QStringToUnicodeGooString(QString::fromUtf8("®©©©©©©©©©©©©©©©©©©©©")); // clazy:exclude=qstring-allocations
132 
133     Unicode *in;
134     const int in_len = TextStringToUCS4(goo->toStr(), &in);
135 
136     delete goo;
137 
138     int in_norm_len;
139     int *in_norm_idx;
140     Unicode *in_norm = unicodeNormalizeNFKC(in, in_len, &in_norm_len, &in_norm_idx, true);
141 
142     free(in);
143 
144     Unicode *out;
145     int out_len;
146     int *out_ascii_idx;
147 
148     unicodeToAscii7(in_norm, in_norm_len, &out, &out_len, in_norm_idx, &out_ascii_idx);
149 
150     free(in_norm);
151     free(in_norm_idx);
152 
153     // ascii7 conversion: ® -> (R)   © -> (c)
154     const char *expected_ascii = (char *)"(R)(c)(c)(c)(c)(c)(c)(c)(c)(c)(c)(c)(c)(c)(c)(c)(c)(c)(c)(c)(c)";
155 
156     QCOMPARE(out_len, (int)strlen(expected_ascii));
157     QVERIFY(compare(out, expected_ascii, out_len));
158 
159     free(out);
160     free(out_ascii_idx);
161 }
162 
testUnicodeLittleEndian()163 void TestUTFConversion::testUnicodeLittleEndian()
164 {
165     uint16_t UTF16LE_hi[5] { 0xFFFE, 0x4800, 0x4900, 0x2100, 0x1126 }; // UTF16-LE "HI!☑"
166     std::string GooUTF16LE(reinterpret_cast<const char *>(UTF16LE_hi), sizeof(UTF16LE_hi));
167 
168     uint16_t UTF16BE_hi[5] { 0xFEFF, 0x0048, 0x0049, 0x0021, 0x2611 }; // UTF16-BE "HI!☑"
169     std::string GooUTF16BE(reinterpret_cast<const char *>(UTF16BE_hi), sizeof(UTF16BE_hi));
170 
171     // Let's assert both GooString's are different
172     QVERIFY(GooUTF16LE != GooUTF16BE);
173 
174     Unicode *UCS4fromLE, *UCS4fromBE;
175     const int len1 = TextStringToUCS4(GooUTF16LE, &UCS4fromLE);
176     const int len2 = TextStringToUCS4(GooUTF16BE, &UCS4fromBE);
177 
178     // len is 4 because TextStringToUCS4() removes the two leading Byte Order Mark (BOM) code points
179     QCOMPARE(len1, len2);
180     QCOMPARE(len1, 4);
181 
182     // Check that now after conversion, UCS4fromLE and UCS4fromBE are now the same
183     for (int i = 0; i < len1; i++) {
184         QCOMPARE(UCS4fromLE[i], UCS4fromBE[i]);
185     }
186 
187     const QString expected = QString::fromUtf8("HI!☑"); // clazy:exclude=qstring-allocations
188 
189     // Do some final verifications, checking the strings to be "HI!"
190     QVERIFY(*UCS4fromLE == *UCS4fromBE);
191     QVERIFY(compare(UCS4fromLE, expected.utf16(), len1));
192     QVERIFY(compare(UCS4fromBE, expected.utf16(), len1));
193 }
194 
195 QTEST_GUILESS_MAIN(TestUTFConversion)
196 #include "check_utf_conversion.moc"
197